[2024-06-26 12:41:03,309][fairseq_cli.train][INFO] - {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 200, 'log_format': 'json', 'log_file': None, 'tensorboard_logdir': 'tblog', 'wandb_project': 'AVSP-LLM', 'azureml_logging': False, 'seed': 1337, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/theodore/Projects/VSP-LLM/src', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'no_c10d', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': True, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_algorithm': 'LocalSGD', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': None, 'batch_size': 1, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': None, 'batch_size_valid': 1, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': True, 'update_freq': [8], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False}, 'checkpoint': {'_name': None, 'save_dir': 'checkpoints', 'restore_file': 'checkpoint_last.pt', 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 2500, 'keep_interval_updates': 1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'accuracy', 'maximize_best_checkpoint_metric': True, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': {'_name': 'vsp_llm', 'w2v_path': '/home/theodore/Projects/VSP-LLM/checkpoints/large_vox_iter5.pt', 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'apply_mask': False, 'mask_selection': 'static', 'mask_length': 10, 'mask_other': 0, 'mask_prob': 0.75, 'mask_channel_selection': 'static', 'mask_channel_length': 64, 'mask_channel_other': 0, 'mask_channel_prob': 0.5, 'layerdrop': 0.1, 'dropout': 0.0, 'activation_dropout': 0.1, 'attention_dropout': 0.0, 'feature_grad_mult': 1.0, 'encoder_embed_dim': 1024, 'decoder_embed_dim': 4096, 'freeze_finetune_updates': 18000}, 'task': {'_name': 'vsp_llm_training', 'is_s2s': True, 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/200h', 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/200h', 'normalize': True, 'labels': ['wrd'], 'single_target': True, 'fine_tuning': True, 'stack_order_audio': 4, 'max_sample_size': 500, 'modalities': ['video', 'audio'], 'image_aug': True, 'pad_audio': True, 'random_crop': False, 'llm_ckpt_path': 'vilm/vinallama-2.7b'}, 'criterion': {'_name': 'decoder_only_language_modeling_loss', 'report_accuracy': True, 'label_smoothing': 0.1}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9,0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'tri_stage', 'warmup_steps': 20000, 'hold_steps': 0, 'decay_steps': 40000, 'phase_ratio': None, 'init_lr_scale': 0.01, 'final_lr_scale': 0.05, 'max_update': 60000, 'lr': [0.0005]}, 'scoring': None, 'bpe': None, 'tokenizer': None, 'job_logging_cfg': {'version': 1, 'formatters': {'simple': {'format': '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.FileHandler', 'formatter': 'simple', 'filename': 'hydra_train.log'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}, 'disable_existing_loggers': False}} [2024-06-26 12:41:03,312][src.vsp_llm_training][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.3 [2024-06-26 12:41:03,312][src.vsp_llm_training][INFO] - AVHubertPretrainingTask Config {'_name': 'vsp_llm_training', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/200h', 'labels': ['wrd'], 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/200h', 'label_rate': -1, 'sample_rate': 16000, 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'normalize': True, 'enable_padding': False, 'max_sample_size': 500, 'min_sample_size': None, 'max_trim_sample_size': '${task.max_sample_size}', 'single_target': True, 'random_crop': False, 'pad_audio': True, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['video', 'audio'], 'is_s2s': True, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': True} [2024-06-26 12:41:05,207][src.hubert_pretraining][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.3 [2024-06-26 12:41:05,207][src.hubert_pretraining][INFO] - AVHubertPretrainingTask Config {'_name': 'av_hubert_pretraining', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/200h', 'labels': ['km'], 'label_dir': '/checkpoint/bshi/data/lrs3//video/hubert/stitch-iters/envox-iter4-l12c2000/', 'label_rate': 25, 'sample_rate': 25, 'normalize': True, 'enable_padding': False, 'max_sample_size': 2000, 'min_sample_size': 5, 'max_trim_sample_size': 400, 'single_target': False, 'random_crop': True, 'pad_audio': False, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['audio', 'video'], 'is_s2s': False, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': False} [2024-06-26 12:41:05,211][src.hubert][INFO] - HubertModel Config: {'_name': 'av_hubert', 'label_rate': 25, 'input_modality': '${task.input_modality}', 'extractor_mode': default, 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': gelu, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.1, 'encoder_layerdrop': 0.1, 'dropout_input': 0.0, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 1.0, 'mask_length_audio': 10, 'mask_prob_audio': 0.8, 'mask_length_image': 5, 'mask_prob_image': 0.3, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'resnet_relu_type': 'prelu', 'resnet_weights': None, 'sim_type': 'cosine', 'sub_encoder_layers': 0, 'audio_feat_dim': 104, 'modality_dropout': 0.5, 'audio_dropout': 0.5, 'modality_fuse': 'concat', 'selection_type': 'same_seq', 'masking_type': 'input', 'decoder_embed_dim': 768, 'decoder_ffn_embed_dim': 3072, 'decoder_layers': 6, 'decoder_layerdrop': 0.0, 'decoder_attention_heads': 4, 'decoder_learned_pos': False, 'decoder_normalize_before': False, 'no_token_positional_embeddings': False, 'decoder_dropout': 0.1, 'decoder_attention_dropout': 0.1, 'decoder_activation_dropout': 0.0, 'max_target_positions': 2048, 'share_decoder_input_output_embed': False, 'no_scale_embedding': True} [2024-06-26 12:41:12,642][fairseq_cli.train][INFO] - avhubert_llm_seq2seq_cluster_count( (encoder): HubertEncoderWrapper( (w2v_model): AVHubertModel( (feature_extractor_audio): SubModel( (proj): Linear(in_features=104, out_features=1024, bias=True) ) (feature_extractor_video): SubModel( (resnet): ResEncoder( (frontend3D): Sequential( (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False) (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (2): PReLU(num_parameters=64) (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False) ) (trunk): ResNet( (layer1): Sequential( (0): BasicBlock( (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=64) (relu2): PReLU(num_parameters=64) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) (1): BasicBlock( (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=64) (relu2): PReLU(num_parameters=64) (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer2): Sequential( (0): BasicBlock( (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=128) (relu2): PReLU(num_parameters=128) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=128) (relu2): PReLU(num_parameters=128) (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer3): Sequential( (0): BasicBlock( (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=256) (relu2): PReLU(num_parameters=256) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=256) (relu2): PReLU(num_parameters=256) (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer4): Sequential( (0): BasicBlock( (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=512) (relu2): PReLU(num_parameters=512) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False) (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu1): PReLU(num_parameters=512) (relu2): PReLU(num_parameters=512) (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (avgpool): AdaptiveAvgPool2d(output_size=1) ) ) (proj): Linear(in_features=512, out_features=1024, bias=True) ) (post_extract_proj): Linear(in_features=2048, out_features=1024, bias=True) (dropout_input): Dropout(p=0.0, inplace=False) (dropout_features): Dropout(p=0.1, inplace=False) (encoder): TransformerEncoder( (pos_conv): Sequential( (0): Conv1d(1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16) (1): SamePad() (2): GELU(approximate='none') ) (layers): ModuleList( (0-23): 24 x TransformerSentenceEncoderLayer( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (dropout1): Dropout(p=0.0, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.0, inplace=False) (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=4096, bias=True) (fc2): Linear(in_features=4096, out_features=1024, bias=True) (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) ) (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) (layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True) (final_proj): None ) ) (decoder): PeftModelForCausalLM( (base_model): LoraModel( (model): LlamaForCausalLM( (model): LlamaModel( (embed_tokens): Embedding(46304, 2560, padding_idx=0) (layers): ModuleList( (0-31): 32 x LlamaDecoderLayer( (self_attn): LlamaSdpaAttention( (q_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2560, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=2560, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (k_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2560, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=2560, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (v_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2560, out_features=16, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=16, out_features=2560, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False) (rotary_emb): LlamaRotaryEmbedding() ) (mlp): LlamaMLP( (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False) (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False) (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False) (act_fn): SiLU() ) (input_layernorm): LlamaRMSNorm() (post_attention_layernorm): LlamaRMSNorm() ) ) (norm): LlamaRMSNorm() ) (lm_head): Linear(in_features=2560, out_features=46304, bias=False) ) ) ) (avfeat_to_llm): Linear(in_features=1024, out_features=2560, bias=True) ) [2024-06-26 12:41:12,647][fairseq_cli.train][INFO] - task: VSP_LLM_TrainingTask [2024-06-26 12:41:12,647][fairseq_cli.train][INFO] - model: avhubert_llm_seq2seq_cluster_count [2024-06-26 12:41:12,647][fairseq_cli.train][INFO] - criterion: decoder_only_language_modeling_loss [2024-06-26 12:41:12,650][fairseq_cli.train][INFO] - num. shared model params: 1,841,644,264 (num. trained: 335,624,424) [2024-06-26 12:41:12,653][fairseq_cli.train][INFO] - num. expert model params: 0 (num. trained: 0) [2024-06-26 12:41:12,654][src.vsp_llm_training][INFO] - Using tokenizer [2024-06-26 12:41:12,693][src.vsp_llm_dataset][INFO] - max_keep=500, min_keep=None, loaded 23990, skipped 0 short and 0 long and 0 unaligned, longest-loaded=76, shortest-loaded=76 [2024-06-26 12:41:13,028][src.vsp_llm_dataset][INFO] - /home/theodore/Projects/VSP-LLM/data/processed/vasr/200h/valid.wrd is sequence label. skipped [2024-06-26 12:41:13,029][src.vsp_llm_dataset][INFO] - image transform: Compose( Normalize(mean=0.0, std=255.0) Normalize(mean=0.421, std=0.165) ) [2024-06-26 12:41:13,029][src.vsp_llm_dataset][INFO] - pad_audio=True, random_crop=False, normalize=True, max_sample_size=500, seqs2seq data=True, [2024-06-26 12:41:13,029][src.vsp_llm_dataset][INFO] - Noise wav: None->0 wav, Prob: 0.0, SNR: 0, Number of mixture: 1 [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.0.conv1.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.0.conv2.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.1.conv1.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.1.conv2.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.conv1.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.conv2.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.downsample.0.bias [2024-06-26 12:41:13,206][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.1.conv1.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.1.conv2.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.conv1.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.conv2.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.downsample.0.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.1.conv1.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.1.conv2.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.conv1.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.conv2.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.downsample.0.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.1.conv1.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.1.conv2.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.o_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.gate_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.up_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.down_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.o_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.gate_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.up_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.down_proj.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,207][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.o_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.gate_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.up_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.down_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.o_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.gate_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.up_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.down_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.o_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.gate_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.up_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.down_proj.bias [2024-06-26 12:41:13,208][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.o_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.gate_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.up_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.down_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.o_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.gate_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.up_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.down_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.o_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.gate_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.up_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.down_proj.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,209][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.o_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.gate_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.up_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.down_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.o_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.gate_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.up_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.down_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.o_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.gate_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.up_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.down_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.o_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.gate_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.up_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.down_proj.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,210][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.o_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.gate_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.up_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.down_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.o_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.gate_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.up_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.down_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.o_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.gate_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.up_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.down_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.o_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.gate_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.up_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.down_proj.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,211][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.o_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.gate_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.up_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.down_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.o_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.gate_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.up_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.down_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.o_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.gate_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.up_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.down_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.o_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.gate_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.up_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.down_proj.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,212][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.o_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.gate_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.up_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.down_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.o_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.gate_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.up_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.down_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.o_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.gate_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.up_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.down_proj.bias [2024-06-26 12:41:13,213][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.o_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.gate_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.up_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.down_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.o_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.gate_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.up_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.down_proj.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,214][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.o_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.gate_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.up_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.down_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.o_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.gate_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.up_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.down_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.o_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.gate_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.up_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.down_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.o_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.gate_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.up_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.down_proj.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,215][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.o_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.gate_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.up_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.down_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.o_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.gate_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.up_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.down_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.base_layer.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.o_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.gate_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.up_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.down_proj.bias [2024-06-26 12:41:13,216][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.lm_head.bias [2024-06-26 12:41:13,216][fairseq.utils][INFO] - ***********************CUDA enviroments for all 1 workers*********************** [2024-06-26 12:41:13,216][fairseq.utils][INFO] - rank 0: capabilities = 8.6 ; total memory = 15.729 GB ; name = NVIDIA RTX A4000 [2024-06-26 12:41:13,216][fairseq.utils][INFO] - ***********************CUDA enviroments for all 1 workers*********************** [2024-06-26 12:41:13,217][fairseq_cli.train][INFO] - training on 1 devices (GPUs/TPUs) [2024-06-26 12:41:13,217][fairseq_cli.train][INFO] - max tokens per device = None and max sentences per device = 1 [2024-06-26 12:41:13,217][fairseq.trainer][INFO] - Preparing to load checkpoint checkpoints/checkpoint_last.pt [2024-06-26 12:41:13,217][fairseq.trainer][INFO] - No existing checkpoint found checkpoints/checkpoint_last.pt [2024-06-26 12:41:13,217][fairseq.trainer][INFO] - loading train data for epoch 1 [2024-06-26 12:41:13,217][src.vsp_llm_training][INFO] - Using tokenizer [2024-06-26 12:41:13,561][src.vsp_llm_dataset][INFO] - max_keep=500, min_keep=None, loaded 241351, skipped 0 short and 0 long and 0 unaligned, longest-loaded=76, shortest-loaded=74 [2024-06-26 12:41:14,044][src.vsp_llm_dataset][INFO] - /home/theodore/Projects/VSP-LLM/data/processed/vasr/200h/train.wrd is sequence label. skipped [2024-06-26 12:41:14,044][src.vsp_llm_dataset][INFO] - image transform: Compose( Normalize(mean=0.0, std=255.0) RandomCrop(size=(88, 88)) Normalize(mean=0.421, std=0.165) ) [2024-06-26 12:41:14,044][src.vsp_llm_dataset][INFO] - pad_audio=True, random_crop=False, normalize=True, max_sample_size=500, seqs2seq data=True, [2024-06-26 12:41:14,044][src.vsp_llm_dataset][INFO] - Noise wav: None->0 wav, Prob: 0.0, SNR: 0, Number of mixture: 1 [2024-06-26 12:41:18,160][fairseq.trainer][INFO] - begin training epoch 1 [2024-06-26 12:41:18,160][fairseq_cli.train][INFO] - Start iterating over samples [2024-06-26 12:46:45,841][train_inner][INFO] - {"epoch": 1, "update": 0.007, "loss": "7.725", "ntokens": "126.33", "acc_total": "126.33", "n_correct": "17.775", "wer_total": "126.33", "n_error": "108.55", "ppl": "211.52", "accuracy": "14.07", "wer": "85.926", "wps": "77.3", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "200", "lr": "9.95e-06", "gnorm": "8.94", "loss_scale": "128", "train_wall": "327", "gb_free": "7.1", "wall": "333"} [2024-06-26 12:52:15,148][train_inner][INFO] - {"epoch": 1, "update": 0.013, "loss": "6.341", "ntokens": "125.77", "acc_total": "125.77", "n_correct": "24.705", "wer_total": "125.77", "n_error": "100.88", "ppl": "81.09", "accuracy": "19.643", "wer": "80.21", "wps": "76.4", "ups": "0.61", "wpb": "125.8", "bsz": "8", "num_updates": "400", "lr": "1.49e-05", "gnorm": "3.79", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "662"} [2024-06-26 12:57:44,638][train_inner][INFO] - {"epoch": 1, "update": 0.02, "loss": "6.067", "ntokens": "125.865", "acc_total": "125.865", "n_correct": "28.075", "wer_total": "125.865", "n_error": "97.5", "ppl": "67.02", "accuracy": "22.306", "wer": "77.464", "wps": "76.4", "ups": "0.61", "wpb": "125.9", "bsz": "8", "num_updates": "600", "lr": "1.985e-05", "gnorm": "3.784", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "991"} [2024-06-26 13:03:14,253][train_inner][INFO] - {"epoch": 1, "update": 0.027, "loss": "5.951", "ntokens": "127.225", "acc_total": "127.225", "n_correct": "29.78", "wer_total": "127.225", "n_error": "97.185", "ppl": "61.86", "accuracy": "23.407", "wer": "76.388", "wps": "77.2", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "800", "lr": "2.48e-05", "gnorm": "4.084", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1321"} [2024-06-26 13:08:43,569][train_inner][INFO] - {"epoch": 1, "update": 0.033, "loss": "5.865", "ntokens": "127.265", "acc_total": "127.265", "n_correct": "30.695", "wer_total": "127.265", "n_error": "96.31", "ppl": "58.29", "accuracy": "24.119", "wer": "75.677", "wps": "77.3", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "1000", "lr": "2.975e-05", "gnorm": "4.239", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1650"} [2024-06-26 13:14:13,021][train_inner][INFO] - {"epoch": 1, "update": 0.04, "loss": "5.826", "ntokens": "127.445", "acc_total": "127.445", "n_correct": "30.85", "wer_total": "127.445", "n_error": "96.375", "ppl": "56.71", "accuracy": "24.207", "wer": "75.621", "wps": "77.4", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "1200", "lr": "3.47e-05", "gnorm": "4.198", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1980"} [2024-06-26 13:19:42,605][train_inner][INFO] - {"epoch": 1, "update": 0.046, "loss": "5.849", "ntokens": "126.29", "acc_total": "126.29", "n_correct": "30.895", "wer_total": "126.29", "n_error": "95.155", "ppl": "57.65", "accuracy": "24.464", "wer": "75.346", "wps": "76.6", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "1400", "lr": "3.965e-05", "gnorm": "4.26", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2309"} [2024-06-26 13:25:12,413][train_inner][INFO] - {"epoch": 1, "update": 0.053, "loss": "5.784", "ntokens": "128.63", "acc_total": "128.63", "n_correct": "32.1", "wer_total": "128.63", "n_error": "96.27", "ppl": "55.09", "accuracy": "24.955", "wer": "74.843", "wps": "78", "ups": "0.61", "wpb": "128.6", "bsz": "8", "num_updates": "1600", "lr": "4.46e-05", "gnorm": "4.132", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2639"} [2024-06-26 13:30:41,949][train_inner][INFO] - {"epoch": 1, "update": 0.06, "loss": "5.795", "ntokens": "127.18", "acc_total": "127.18", "n_correct": "32.165", "wer_total": "127.18", "n_error": "94.755", "ppl": "55.52", "accuracy": "25.291", "wer": "74.505", "wps": "77.2", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "1800", "lr": "4.955e-05", "gnorm": "3.789", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2969"} [2024-06-26 13:36:10,929][train_inner][INFO] - {"epoch": 1, "update": 0.066, "loss": "5.679", "ntokens": "126.5", "acc_total": "126.5", "n_correct": "32.72", "wer_total": "126.5", "n_error": "93.56", "ppl": "51.24", "accuracy": "25.866", "wer": "73.96", "wps": "76.9", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "2000", "lr": "5.45e-05", "gnorm": "3.658", "loss_scale": "128", "train_wall": "328", "gb_free": "7.1", "wall": "3298"} [2024-06-26 13:41:40,461][train_inner][INFO] - {"epoch": 1, "update": 0.073, "loss": "5.703", "ntokens": "126.885", "acc_total": "126.885", "n_correct": "32.495", "wer_total": "126.885", "n_error": "94.055", "ppl": "52.1", "accuracy": "25.61", "wer": "74.126", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "2200", "lr": "5.945e-05", "gnorm": "3.495", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "3627"} [2024-06-26 13:47:09,796][train_inner][INFO] - {"epoch": 1, "update": 0.08, "loss": "5.728", "ntokens": "127.585", "acc_total": "127.585", "n_correct": "32.93", "wer_total": "127.585", "n_error": "94.455", "ppl": "52.98", "accuracy": "25.81", "wer": "74.033", "wps": "77.5", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "2400", "lr": "6.44e-05", "gnorm": "3.373", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "3957"} [2024-06-26 13:49:54,600][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 14:31:53,238][valid][INFO] - {"epoch": 1, "valid_loss": "5.543", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "4.87924", "valid_wer_total": "18.1585", "valid_n_error": "13.2391", "valid_ppl": "46.62", "valid_accuracy": "26.87", "valid_wer": "72.909", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "2500"} [2024-06-26 14:31:53,239][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 2500 updates [2024-06-26 14:31:53,239][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_2500.pt [2024-06-26 14:31:56,521][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_2500.pt [2024-06-26 14:31:59,553][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_2500.pt (epoch 1 @ 2500 updates, score 26.87) (writing took 6.314380053896457 seconds) [2024-06-26 14:34:44,196][train_inner][INFO] - {"epoch": 1, "update": 0.086, "loss": "5.601", "ntokens": "127.89", "acc_total": "127.89", "n_correct": "34.65", "wer_total": "127.89", "n_error": "93.02", "ppl": "48.53", "accuracy": "27.094", "wer": "72.734", "wps": "9", "ups": "0.07", "wpb": "127.9", "bsz": "8", "num_updates": "2600", "lr": "6.935e-05", "gnorm": "3.391", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "6811"} [2024-06-26 14:40:13,985][train_inner][INFO] - {"epoch": 1, "update": 0.093, "loss": "5.56", "ntokens": "128.195", "acc_total": "128.195", "n_correct": "35.89", "wer_total": "128.195", "n_error": "92.065", "ppl": "47.17", "accuracy": "27.996", "wer": "71.816", "wps": "77.7", "ups": "0.61", "wpb": "128.2", "bsz": "8", "num_updates": "2800", "lr": "7.43e-05", "gnorm": "3.544", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7141"} [2024-06-26 14:45:43,492][train_inner][INFO] - {"epoch": 1, "update": 0.099, "loss": "5.465", "ntokens": "126.99", "acc_total": "126.99", "n_correct": "36.66", "wer_total": "126.99", "n_error": "90.035", "ppl": "44.18", "accuracy": "28.868", "wer": "70.899", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "3000", "lr": "7.925e-05", "gnorm": "3.583", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7470"} [2024-06-26 14:51:13,166][train_inner][INFO] - {"epoch": 1, "update": 0.106, "loss": "5.361", "ntokens": "126.46", "acc_total": "126.46", "n_correct": "38.415", "wer_total": "126.46", "n_error": "87.775", "ppl": "41.1", "accuracy": "30.377", "wer": "69.409", "wps": "76.7", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "3200", "lr": "8.42e-05", "gnorm": "3.929", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7800"} [2024-06-26 14:56:42,927][train_inner][INFO] - {"epoch": 1, "update": 0.113, "loss": "5.231", "ntokens": "126.495", "acc_total": "126.495", "n_correct": "40.87", "wer_total": "126.495", "n_error": "85.39", "ppl": "37.56", "accuracy": "32.31", "wer": "67.505", "wps": "76.7", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "3400", "lr": "8.915e-05", "gnorm": "4.264", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "8130"} [2024-06-26 15:02:12,619][train_inner][INFO] - {"epoch": 1, "update": 0.119, "loss": "5.067", "ntokens": "127.44", "acc_total": "127.44", "n_correct": "43.94", "wer_total": "127.44", "n_error": "83.24", "ppl": "33.52", "accuracy": "34.479", "wer": "65.317", "wps": "77.3", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "3600", "lr": "9.41e-05", "gnorm": "4.611", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "8459"} [2024-06-26 15:07:42,391][train_inner][INFO] - {"epoch": 1, "update": 0.126, "loss": "4.884", "ntokens": "126.75", "acc_total": "126.75", "n_correct": "47.18", "wer_total": "126.75", "n_error": "79.34", "ppl": "29.53", "accuracy": "37.223", "wer": "62.596", "wps": "76.9", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "3800", "lr": "9.905e-05", "gnorm": "4.914", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "8789"} [2024-06-26 15:13:11,905][train_inner][INFO] - {"epoch": 1, "update": 0.133, "loss": "4.682", "ntokens": "126.045", "acc_total": "126.045", "n_correct": "49.04", "wer_total": "126.045", "n_error": "76.785", "ppl": "25.67", "accuracy": "38.907", "wer": "60.919", "wps": "76.5", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "4000", "lr": "0.000104", "gnorm": "5.162", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "9119"} [2024-06-26 15:18:41,697][train_inner][INFO] - {"epoch": 1, "update": 0.139, "loss": "4.512", "ntokens": "126.485", "acc_total": "126.485", "n_correct": "51.475", "wer_total": "126.485", "n_error": "74.815", "ppl": "22.82", "accuracy": "40.697", "wer": "59.149", "wps": "76.7", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "4200", "lr": "0.00010895", "gnorm": "5.254", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "9448"} [2024-06-26 15:24:11,307][train_inner][INFO] - {"epoch": 1, "update": 0.146, "loss": "4.285", "ntokens": "127.475", "acc_total": "127.475", "n_correct": "54.76", "wer_total": "127.475", "n_error": "72.535", "ppl": "19.49", "accuracy": "42.957", "wer": "56.901", "wps": "77.3", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "4400", "lr": "0.0001139", "gnorm": "5.523", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "9778"} [2024-06-26 15:29:41,024][train_inner][INFO] - {"epoch": 1, "update": 0.152, "loss": "4.315", "ntokens": "126.705", "acc_total": "126.705", "n_correct": "54.625", "wer_total": "126.705", "n_error": "71.915", "ppl": "19.9", "accuracy": "43.112", "wer": "56.758", "wps": "76.9", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "4600", "lr": "0.00011885", "gnorm": "5.524", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "10108"} [2024-06-26 15:35:10,610][train_inner][INFO] - {"epoch": 1, "update": 0.159, "loss": "4.082", "ntokens": "127.5", "acc_total": "127.5", "n_correct": "57.69", "wer_total": "127.5", "n_error": "69.675", "ppl": "16.93", "accuracy": "45.247", "wer": "54.647", "wps": "77.4", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "4800", "lr": "0.0001238", "gnorm": "5.489", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "10437"} [2024-06-26 15:40:40,440][train_inner][INFO] - {"epoch": 1, "update": 0.166, "loss": "3.945", "ntokens": "127.91", "acc_total": "127.91", "n_correct": "59.66", "wer_total": "127.91", "n_error": "68", "ppl": "15.4", "accuracy": "46.642", "wer": "53.162", "wps": "77.6", "ups": "0.61", "wpb": "127.9", "bsz": "8", "num_updates": "5000", "lr": "0.00012875", "gnorm": "5.634", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "10767"} [2024-06-26 15:40:40,440][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 16:22:38,616][valid][INFO] - {"epoch": 1, "valid_loss": "3.664", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "9.09033", "valid_wer_total": "18.1585", "valid_n_error": "9.0484", "valid_ppl": "12.67", "valid_accuracy": "50.061", "valid_wer": "49.83", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "5000", "valid_best_accuracy": "50.061"} [2024-06-26 16:22:38,617][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 5000 updates [2024-06-26 16:22:38,617][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_5000.pt [2024-06-26 16:22:41,837][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_5000.pt [2024-06-26 16:22:45,773][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_5000.pt (epoch 1 @ 5000 updates, score 50.061) (writing took 7.156597583089024 seconds) [2024-06-26 16:28:15,062][train_inner][INFO] - {"epoch": 1, "update": 0.172, "loss": "3.909", "ntokens": "126.95", "acc_total": "126.95", "n_correct": "59.845", "wer_total": "126.95", "n_error": "67.005", "ppl": "15.02", "accuracy": "47.141", "wer": "52.781", "wps": "8.9", "ups": "0.07", "wpb": "127", "bsz": "8", "num_updates": "5200", "lr": "0.0001337", "gnorm": "5.622", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "13622"} [2024-06-26 16:33:44,695][train_inner][INFO] - {"epoch": 1, "update": 0.179, "loss": "3.782", "ntokens": "126.17", "acc_total": "126.17", "n_correct": "61.455", "wer_total": "126.17", "n_error": "64.55", "ppl": "13.75", "accuracy": "48.708", "wer": "51.161", "wps": "76.6", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "5400", "lr": "0.00013865", "gnorm": "5.452", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "13951"} [2024-06-26 16:39:14,280][train_inner][INFO] - {"epoch": 1, "update": 0.186, "loss": "3.611", "ntokens": "127.265", "acc_total": "127.265", "n_correct": "63.52", "wer_total": "127.265", "n_error": "63.61", "ppl": "12.22", "accuracy": "49.912", "wer": "49.982", "wps": "77.2", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "5600", "lr": "0.0001436", "gnorm": "5.503", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "14281"} [2024-06-26 16:44:43,764][train_inner][INFO] - {"epoch": 1, "update": 0.192, "loss": "3.609", "ntokens": "126.9", "acc_total": "126.9", "n_correct": "63.305", "wer_total": "126.9", "n_error": "63.465", "ppl": "12.2", "accuracy": "49.886", "wer": "50.012", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "5800", "lr": "0.00014855", "gnorm": "5.543", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "14611"} [2024-06-26 16:50:13,220][train_inner][INFO] - {"epoch": 1, "update": 0.199, "loss": "3.449", "ntokens": "127.085", "acc_total": "127.085", "n_correct": "65.42", "wer_total": "127.085", "n_error": "61.535", "ppl": "10.92", "accuracy": "51.477", "wer": "48.42", "wps": "77.1", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "6000", "lr": "0.0001535", "gnorm": "5.35", "loss_scale": "512", "train_wall": "329", "gb_free": "7.1", "wall": "14940"} [2024-06-26 16:55:42,704][train_inner][INFO] - {"epoch": 1, "update": 0.206, "loss": "3.398", "ntokens": "125.755", "acc_total": "125.755", "n_correct": "65.695", "wer_total": "125.755", "n_error": "59.925", "ppl": "10.54", "accuracy": "52.24", "wer": "47.652", "wps": "76.3", "ups": "0.61", "wpb": "125.8", "bsz": "8", "num_updates": "6200", "lr": "0.00015845", "gnorm": "5.477", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "15269"} [2024-06-26 17:01:12,000][train_inner][INFO] - {"epoch": 1, "update": 0.212, "loss": "3.406", "ntokens": "126.855", "acc_total": "126.855", "n_correct": "66.36", "wer_total": "126.855", "n_error": "60.385", "ppl": "10.6", "accuracy": "52.312", "wer": "47.602", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "6400", "lr": "0.0001634", "gnorm": "5.339", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "15599"} [2024-06-26 17:06:41,312][train_inner][INFO] - {"epoch": 1, "update": 0.219, "loss": "3.232", "ntokens": "127.655", "acc_total": "127.655", "n_correct": "69.265", "wer_total": "127.655", "n_error": "58.295", "ppl": "9.4", "accuracy": "54.26", "wer": "45.666", "wps": "77.5", "ups": "0.61", "wpb": "127.7", "bsz": "8", "num_updates": "6600", "lr": "0.00016835", "gnorm": "5.207", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "15928"} [2024-06-26 17:12:10,461][train_inner][INFO] - {"epoch": 1, "update": 0.225, "loss": "3.104", "ntokens": "127.66", "acc_total": "127.66", "n_correct": "70.32", "wer_total": "127.66", "n_error": "57.26", "ppl": "8.6", "accuracy": "55.084", "wer": "44.854", "wps": "77.6", "ups": "0.61", "wpb": "127.7", "bsz": "8", "num_updates": "6800", "lr": "0.0001733", "gnorm": "5.081", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16257"} [2024-06-26 17:17:39,643][train_inner][INFO] - {"epoch": 1, "update": 0.232, "loss": "3.164", "ntokens": "126.98", "acc_total": "126.98", "n_correct": "69.355", "wer_total": "126.98", "n_error": "57.47", "ppl": "8.96", "accuracy": "54.619", "wer": "45.259", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "7000", "lr": "0.00017825", "gnorm": "5.135", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16586"} [2024-06-26 17:23:08,799][train_inner][INFO] - {"epoch": 1, "update": 0.239, "loss": "3.049", "ntokens": "127.19", "acc_total": "127.19", "n_correct": "70.57", "wer_total": "127.19", "n_error": "56.515", "ppl": "8.28", "accuracy": "55.484", "wer": "44.434", "wps": "77.3", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "7200", "lr": "0.0001832", "gnorm": "4.996", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16916"} [2024-06-26 17:28:38,234][train_inner][INFO] - {"epoch": 1, "update": 0.245, "loss": "3.048", "ntokens": "125.55", "acc_total": "125.55", "n_correct": "69.78", "wer_total": "125.55", "n_error": "55.695", "ppl": "8.27", "accuracy": "55.579", "wer": "44.361", "wps": "76.2", "ups": "0.61", "wpb": "125.5", "bsz": "8", "num_updates": "7400", "lr": "0.00018815", "gnorm": "5.035", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "17245"} [2024-06-26 17:31:22,721][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 18:13:20,789][valid][INFO] - {"epoch": 1, "valid_loss": "2.719", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "10.8264", "valid_wer_total": "18.1585", "valid_n_error": "7.32055", "valid_ppl": "6.58", "valid_accuracy": "59.622", "valid_wer": "40.315", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "7500", "valid_best_accuracy": "59.622"} [2024-06-26 18:13:20,790][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 7500 updates [2024-06-26 18:13:20,790][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_7500.pt [2024-06-26 18:13:24,085][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_7500.pt [2024-06-26 18:13:29,512][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_7500.pt (epoch 1 @ 7500 updates, score 59.622) (writing took 8.721956300083548 seconds) [2024-06-26 18:16:13,762][train_inner][INFO] - {"epoch": 1, "update": 0.252, "loss": "2.964", "ntokens": "126.9", "acc_total": "126.9", "n_correct": "71.785", "wer_total": "126.9", "n_error": "55.03", "ppl": "7.8", "accuracy": "56.568", "wer": "43.365", "wps": "8.9", "ups": "0.07", "wpb": "126.9", "bsz": "8", "num_updates": "7600", "lr": "0.0001931", "gnorm": "4.907", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "20101"} [2024-06-26 18:21:43,182][train_inner][INFO] - {"epoch": 1, "update": 0.259, "loss": "2.915", "ntokens": "126.055", "acc_total": "126.055", "n_correct": "72.065", "wer_total": "126.055", "n_error": "53.84", "ppl": "7.54", "accuracy": "57.169", "wer": "42.712", "wps": "76.5", "ups": "0.61", "wpb": "126.1", "bsz": "8", "num_updates": "7800", "lr": "0.00019805", "gnorm": "4.797", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "20430"} [2024-06-26 18:27:12,529][train_inner][INFO] - {"epoch": 1, "update": 0.265, "loss": "2.879", "ntokens": "126.355", "acc_total": "126.355", "n_correct": "73.105", "wer_total": "126.355", "n_error": "53.145", "ppl": "7.36", "accuracy": "57.857", "wer": "42.06", "wps": "76.7", "ups": "0.61", "wpb": "126.4", "bsz": "8", "num_updates": "8000", "lr": "0.000203", "gnorm": "4.839", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "20759"} [2024-06-26 18:32:41,839][train_inner][INFO] - {"epoch": 1, "update": 0.272, "loss": "2.786", "ntokens": "128.025", "acc_total": "128.025", "n_correct": "74.775", "wer_total": "128.025", "n_error": "53.165", "ppl": "6.9", "accuracy": "58.407", "wer": "41.527", "wps": "77.8", "ups": "0.61", "wpb": "128", "bsz": "8", "num_updates": "8200", "lr": "0.00020795", "gnorm": "4.74", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "21089"} [2024-06-26 18:38:10,979][train_inner][INFO] - {"epoch": 1, "update": 0.278, "loss": "2.789", "ntokens": "127.635", "acc_total": "127.635", "n_correct": "74.665", "wer_total": "127.635", "n_error": "52.85", "ppl": "6.91", "accuracy": "58.499", "wer": "41.407", "wps": "77.6", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "8400", "lr": "0.0002129", "gnorm": "4.634", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "21418"} [2024-06-26 18:43:40,203][train_inner][INFO] - {"epoch": 1, "update": 0.285, "loss": "2.719", "ntokens": "127.51", "acc_total": "127.51", "n_correct": "75.335", "wer_total": "127.51", "n_error": "52.095", "ppl": "6.58", "accuracy": "59.082", "wer": "40.856", "wps": "77.5", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "8600", "lr": "0.00021785", "gnorm": "4.524", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "21747"} [2024-06-26 18:49:09,290][train_inner][INFO] - {"epoch": 1, "update": 0.292, "loss": "2.671", "ntokens": "126.85", "acc_total": "126.85", "n_correct": "75.415", "wer_total": "126.85", "n_error": "51.36", "ppl": "6.37", "accuracy": "59.452", "wer": "40.489", "wps": "77.1", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "8800", "lr": "0.0002228", "gnorm": "4.703", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "22076"} [2024-06-26 18:54:38,522][train_inner][INFO] - {"epoch": 1, "update": 0.298, "loss": "2.771", "ntokens": "126.915", "acc_total": "126.915", "n_correct": "74.29", "wer_total": "126.915", "n_error": "52.5", "ppl": "6.83", "accuracy": "58.535", "wer": "41.366", "wps": "77.1", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "9000", "lr": "0.00022775", "gnorm": "4.498", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "22405"} [2024-06-26 19:00:07,619][train_inner][INFO] - {"epoch": 1, "update": 0.305, "loss": "2.674", "ntokens": "126.23", "acc_total": "126.23", "n_correct": "74.905", "wer_total": "126.23", "n_error": "51.215", "ppl": "6.38", "accuracy": "59.34", "wer": "40.573", "wps": "76.7", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "9200", "lr": "0.0002327", "gnorm": "4.471", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "22734"} [2024-06-26 19:05:36,923][train_inner][INFO] - {"epoch": 1, "update": 0.312, "loss": "2.631", "ntokens": "126.885", "acc_total": "126.885", "n_correct": "75.935", "wer_total": "126.885", "n_error": "50.8", "ppl": "6.19", "accuracy": "59.846", "wer": "40.036", "wps": "77.1", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "9400", "lr": "0.00023765", "gnorm": "4.445", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "23064"} [2024-06-26 19:11:06,558][train_inner][INFO] - {"epoch": 1, "update": 0.318, "loss": "2.634", "ntokens": "127.63", "acc_total": "127.63", "n_correct": "76.87", "wer_total": "127.63", "n_error": "50.64", "ppl": "6.21", "accuracy": "60.229", "wer": "39.677", "wps": "77.4", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "9600", "lr": "0.0002426", "gnorm": "4.364", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "23393"} [2024-06-26 19:16:36,094][train_inner][INFO] - {"epoch": 1, "update": 0.325, "loss": "2.57", "ntokens": "127.14", "acc_total": "127.14", "n_correct": "77.085", "wer_total": "127.14", "n_error": "49.935", "ppl": "5.94", "accuracy": "60.63", "wer": "39.276", "wps": "77.2", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "9800", "lr": "0.00024755", "gnorm": "4.233", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "23723"} [2024-06-26 19:22:05,583][train_inner][INFO] - {"epoch": 1, "update": 0.331, "loss": "2.582", "ntokens": "126.49", "acc_total": "126.49", "n_correct": "76.54", "wer_total": "126.49", "n_error": "49.855", "ppl": "5.99", "accuracy": "60.511", "wer": "39.414", "wps": "76.8", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "10000", "lr": "0.0002525", "gnorm": "4.393", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "24052"} [2024-06-26 19:22:05,584][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 20:04:02,552][valid][INFO] - {"epoch": 1, "valid_loss": "2.275", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "11.6965", "valid_wer_total": "18.1585", "valid_n_error": "6.44977", "valid_ppl": "4.84", "valid_accuracy": "64.413", "valid_wer": "35.519", "valid_wps": "173.1", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "10000", "valid_best_accuracy": "64.413"} [2024-06-26 20:04:02,553][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 10000 updates [2024-06-26 20:04:02,553][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_10000.pt [2024-06-26 20:04:05,865][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_10000.pt [2024-06-26 20:04:11,249][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_10000.pt (epoch 1 @ 10000 updates, score 64.413) (writing took 8.695929349167272 seconds) [2024-06-26 20:09:40,262][train_inner][INFO] - {"epoch": 1, "update": 0.338, "loss": "2.497", "ntokens": "126.675", "acc_total": "126.675", "n_correct": "77.6", "wer_total": "126.675", "n_error": "48.975", "ppl": "5.65", "accuracy": "61.259", "wer": "38.662", "wps": "8.9", "ups": "0.07", "wpb": "126.7", "bsz": "8", "num_updates": "10200", "lr": "0.00025745", "gnorm": "4.254", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "26907"} [2024-06-26 20:15:09,616][train_inner][INFO] - {"epoch": 1, "update": 0.345, "loss": "2.501", "ntokens": "127.34", "acc_total": "127.34", "n_correct": "78", "wer_total": "127.34", "n_error": "49.265", "ppl": "5.66", "accuracy": "61.253", "wer": "38.688", "wps": "77.3", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "10400", "lr": "0.0002624", "gnorm": "4.152", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "27236"} [2024-06-26 20:20:38,987][train_inner][INFO] - {"epoch": 1, "update": 0.351, "loss": "2.464", "ntokens": "126.81", "acc_total": "126.81", "n_correct": "77.91", "wer_total": "126.81", "n_error": "48.795", "ppl": "5.52", "accuracy": "61.438", "wer": "38.479", "wps": "77", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "10600", "lr": "0.00026735", "gnorm": "4.266", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "27566"} [2024-06-26 20:26:08,255][train_inner][INFO] - {"epoch": 1, "update": 0.358, "loss": "2.433", "ntokens": "127.225", "acc_total": "127.225", "n_correct": "78.45", "wer_total": "127.225", "n_error": "48.64", "ppl": "5.4", "accuracy": "61.662", "wer": "38.231", "wps": "77.3", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "10800", "lr": "0.0002723", "gnorm": "4.125", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "27895"} [2024-06-26 20:31:37,661][train_inner][INFO] - {"epoch": 1, "update": 0.365, "loss": "2.391", "ntokens": "127.14", "acc_total": "127.14", "n_correct": "79.48", "wer_total": "127.14", "n_error": "47.61", "ppl": "5.25", "accuracy": "62.514", "wer": "37.447", "wps": "77.2", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "11000", "lr": "0.00027725", "gnorm": "4.096", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "28224"} [2024-06-26 20:37:07,021][train_inner][INFO] - {"epoch": 1, "update": 0.371, "loss": "2.42", "ntokens": "127.605", "acc_total": "127.605", "n_correct": "79.295", "wer_total": "127.605", "n_error": "48.24", "ppl": "5.35", "accuracy": "62.141", "wer": "37.804", "wps": "77.5", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "11200", "lr": "0.0002822", "gnorm": "4.176", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "28554"} [2024-06-26 20:42:36,516][train_inner][INFO] - {"epoch": 1, "update": 0.378, "loss": "2.412", "ntokens": "127.07", "acc_total": "127.07", "n_correct": "78.775", "wer_total": "127.07", "n_error": "48.18", "ppl": "5.32", "accuracy": "61.993", "wer": "37.916", "wps": "77.1", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "11400", "lr": "0.00028715", "gnorm": "4.175", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "28883"} [2024-06-26 20:48:05,663][train_inner][INFO] - {"epoch": 1, "update": 0.385, "loss": "2.281", "ntokens": "127.245", "acc_total": "127.245", "n_correct": "80.56", "wer_total": "127.245", "n_error": "46.605", "ppl": "4.86", "accuracy": "63.311", "wer": "36.626", "wps": "77.3", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "11600", "lr": "0.0002921", "gnorm": "4.029", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "29212"} [2024-06-26 20:53:35,050][train_inner][INFO] - {"epoch": 1, "update": 0.391, "loss": "2.379", "ntokens": "126.955", "acc_total": "126.955", "n_correct": "79.945", "wer_total": "126.955", "n_error": "46.945", "ppl": "5.2", "accuracy": "62.971", "wer": "36.978", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "11800", "lr": "0.00029705", "gnorm": "4.168", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "29542"} [2024-06-26 20:59:04,232][train_inner][INFO] - {"epoch": 1, "update": 0.398, "loss": "2.265", "ntokens": "126.46", "acc_total": "126.46", "n_correct": "80.3", "wer_total": "126.46", "n_error": "46.05", "ppl": "4.81", "accuracy": "63.498", "wer": "36.415", "wps": "76.8", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "12000", "lr": "0.000302", "gnorm": "4.041", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "29871"} [2024-06-26 21:04:33,708][train_inner][INFO] - {"epoch": 1, "update": 0.404, "loss": "2.33", "ntokens": "126.88", "acc_total": "126.88", "n_correct": "80.405", "wer_total": "126.88", "n_error": "46.395", "ppl": "5.03", "accuracy": "63.371", "wer": "36.566", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "12200", "lr": "0.00030695", "gnorm": "4.109", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "30200"} [2024-06-26 21:07:21,510][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 4096.0 [2024-06-26 21:10:04,553][train_inner][INFO] - {"epoch": 1, "update": 0.411, "loss": "2.338", "ntokens": "125.71", "acc_total": "125.71", "n_correct": "79.19", "wer_total": "125.71", "n_error": "46.425", "ppl": "5.06", "accuracy": "62.994", "wer": "36.93", "wps": "76", "ups": "0.6", "wpb": "125.7", "bsz": "8", "num_updates": "12400", "lr": "0.0003119", "gnorm": "4.035", "loss_scale": "4096", "train_wall": "330", "gb_free": "7.1", "wall": "30531"} [2024-06-26 21:12:49,209][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 21:54:47,459][valid][INFO] - {"epoch": 1, "valid_loss": "2.062", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "12.1298", "valid_wer_total": "18.1585", "valid_n_error": "6.01872", "valid_ppl": "4.18", "valid_accuracy": "66.8", "valid_wer": "33.145", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "12500", "valid_best_accuracy": "66.8"} [2024-06-26 21:54:47,459][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 12500 updates [2024-06-26 21:54:47,460][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_12500.pt [2024-06-26 21:54:50,796][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_12500.pt [2024-06-26 21:54:56,267][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_12500.pt (epoch 1 @ 12500 updates, score 66.8) (writing took 8.807579348096624 seconds) [2024-06-26 21:57:40,523][train_inner][INFO] - {"epoch": 1, "update": 0.418, "loss": "2.315", "ntokens": "127.035", "acc_total": "127.035", "n_correct": "79.985", "wer_total": "127.035", "n_error": "46.98", "ppl": "4.98", "accuracy": "62.963", "wer": "36.982", "wps": "8.9", "ups": "0.07", "wpb": "127", "bsz": "8", "num_updates": "12600", "lr": "0.00031685", "gnorm": "3.968", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "33387"} [2024-06-26 22:03:09,738][train_inner][INFO] - {"epoch": 1, "update": 0.424, "loss": "2.245", "ntokens": "126.515", "acc_total": "126.515", "n_correct": "81.15", "wer_total": "126.515", "n_error": "45.295", "ppl": "4.74", "accuracy": "64.143", "wer": "35.802", "wps": "76.9", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "12800", "lr": "0.0003218", "gnorm": "3.88", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "33717"} [2024-06-26 22:08:38,992][train_inner][INFO] - {"epoch": 1, "update": 0.431, "loss": "2.298", "ntokens": "126.155", "acc_total": "126.155", "n_correct": "79.8", "wer_total": "126.155", "n_error": "46.275", "ppl": "4.92", "accuracy": "63.256", "wer": "36.681", "wps": "76.6", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "13000", "lr": "0.00032675", "gnorm": "3.892", "loss_scale": "4096", "train_wall": "329", "gb_free": "7.1", "wall": "34046"} [2024-06-26 22:14:08,045][train_inner][INFO] - {"epoch": 1, "update": 0.438, "loss": "2.284", "ntokens": "127.39", "acc_total": "127.39", "n_correct": "81.4", "wer_total": "127.39", "n_error": "45.93", "ppl": "4.87", "accuracy": "63.898", "wer": "36.055", "wps": "77.4", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "13200", "lr": "0.0003317", "gnorm": "3.972", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "34375"} [2024-06-26 22:19:37,203][train_inner][INFO] - {"epoch": 1, "update": 0.444, "loss": "2.258", "ntokens": "127.32", "acc_total": "127.32", "n_correct": "81.1", "wer_total": "127.32", "n_error": "46.135", "ppl": "4.78", "accuracy": "63.698", "wer": "36.235", "wps": "77.4", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "13400", "lr": "0.00033665", "gnorm": "3.87", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "34704"} [2024-06-26 22:25:06,192][train_inner][INFO] - {"epoch": 1, "update": 0.451, "loss": "2.303", "ntokens": "126.25", "acc_total": "126.25", "n_correct": "79.98", "wer_total": "126.25", "n_error": "46.215", "ppl": "4.94", "accuracy": "63.35", "wer": "36.606", "wps": "76.8", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "13600", "lr": "0.0003416", "gnorm": "3.936", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "35033"} [2024-06-26 22:30:35,287][train_inner][INFO] - {"epoch": 1, "update": 0.457, "loss": "2.237", "ntokens": "127.5", "acc_total": "127.5", "n_correct": "81.6", "wer_total": "127.5", "n_error": "45.835", "ppl": "4.71", "accuracy": "64", "wer": "35.949", "wps": "77.5", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "13800", "lr": "0.00034655", "gnorm": "3.975", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "35362"} [2024-06-26 22:33:57,690][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0 [2024-06-26 22:35:23,268][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-26 22:36:07,656][train_inner][INFO] - {"epoch": 1, "update": 0.464, "loss": "2.217", "ntokens": "127.645", "acc_total": "127.645", "n_correct": "81.725", "wer_total": "127.645", "n_error": "45.805", "ppl": "4.65", "accuracy": "64.025", "wer": "35.885", "wps": "76.8", "ups": "0.6", "wpb": "127.6", "bsz": "8", "num_updates": "14000", "lr": "0.0003515", "gnorm": "3.925", "loss_scale": "1024", "train_wall": "332", "gb_free": "7.1", "wall": "35694"} [2024-06-26 22:41:36,827][train_inner][INFO] - {"epoch": 1, "update": 0.471, "loss": "2.246", "ntokens": "127.135", "acc_total": "127.135", "n_correct": "81.485", "wer_total": "127.135", "n_error": "45.59", "ppl": "4.74", "accuracy": "64.093", "wer": "35.86", "wps": "77.2", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "14200", "lr": "0.00035645", "gnorm": "4.027", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36024"} [2024-06-26 22:47:05,930][train_inner][INFO] - {"epoch": 1, "update": 0.477, "loss": "2.259", "ntokens": "126.25", "acc_total": "126.25", "n_correct": "80.575", "wer_total": "126.25", "n_error": "45.575", "ppl": "4.79", "accuracy": "63.822", "wer": "36.099", "wps": "76.7", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "14400", "lr": "0.0003614", "gnorm": "4.082", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36353"} [2024-06-26 22:52:34,977][train_inner][INFO] - {"epoch": 1, "update": 0.484, "loss": "2.214", "ntokens": "127.13", "acc_total": "127.13", "n_correct": "81.55", "wer_total": "127.13", "n_error": "45.515", "ppl": "4.64", "accuracy": "64.147", "wer": "35.802", "wps": "77.3", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "14600", "lr": "0.00036635", "gnorm": "4.008", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36682"} [2024-06-26 22:58:04,101][train_inner][INFO] - {"epoch": 1, "update": 0.491, "loss": "2.172", "ntokens": "127.28", "acc_total": "127.28", "n_correct": "82.85", "wer_total": "127.28", "n_error": "44.385", "ppl": "4.51", "accuracy": "65.093", "wer": "34.872", "wps": "77.3", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "14800", "lr": "0.0003713", "gnorm": "3.866", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "37011"} [2024-06-26 23:03:33,319][train_inner][INFO] - {"epoch": 1, "update": 0.497, "loss": "2.174", "ntokens": "126.2", "acc_total": "126.2", "n_correct": "81.93", "wer_total": "126.2", "n_error": "44.17", "ppl": "4.51", "accuracy": "64.921", "wer": "35", "wps": "76.7", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "15000", "lr": "0.00037625", "gnorm": "3.898", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "37340"} [2024-06-26 23:03:33,319][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-26 23:45:29,844][valid][INFO] - {"epoch": 1, "valid_loss": "1.958", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "12.3337", "valid_wer_total": "18.1585", "valid_n_error": "5.81692", "valid_ppl": "3.89", "valid_accuracy": "67.923", "valid_wer": "32.034", "valid_wps": "173.1", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "15000", "valid_best_accuracy": "67.923"} [2024-06-26 23:45:29,844][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 15000 updates [2024-06-26 23:45:29,845][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_15000.pt [2024-06-26 23:45:33,140][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_15000.pt [2024-06-26 23:45:38,620][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_15000.pt (epoch 1 @ 15000 updates, score 67.923) (writing took 8.775756380986422 seconds) [2024-06-26 23:51:07,691][train_inner][INFO] - {"epoch": 1, "update": 0.504, "loss": "2.166", "ntokens": "127.045", "acc_total": "127.045", "n_correct": "82.755", "wer_total": "127.045", "n_error": "44.22", "ppl": "4.49", "accuracy": "65.138", "wer": "34.807", "wps": "8.9", "ups": "0.07", "wpb": "127", "bsz": "8", "num_updates": "15200", "lr": "0.0003812", "gnorm": "3.911", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "40194"} [2024-06-26 23:56:36,911][train_inner][INFO] - {"epoch": 1, "update": 0.511, "loss": "2.23", "ntokens": "126.73", "acc_total": "126.73", "n_correct": "81.45", "wer_total": "126.73", "n_error": "45.175", "ppl": "4.69", "accuracy": "64.27", "wer": "35.647", "wps": "77", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "15400", "lr": "0.00038615", "gnorm": "4.023", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "40524"} [2024-06-27 00:02:07,416][train_inner][INFO] - {"epoch": 1, "update": 0.517, "loss": "2.188", "ntokens": "126.755", "acc_total": "126.755", "n_correct": "82.25", "wer_total": "126.755", "n_error": "44.43", "ppl": "4.56", "accuracy": "64.889", "wer": "35.052", "wps": "76.7", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "15600", "lr": "0.0003911", "gnorm": "4.022", "loss_scale": "1024", "train_wall": "330", "gb_free": "7.1", "wall": "40854"} [2024-06-27 00:07:40,460][train_inner][INFO] - {"epoch": 1, "update": 0.524, "loss": "2.131", "ntokens": "127.395", "acc_total": "127.395", "n_correct": "83.06", "wer_total": "127.395", "n_error": "44.26", "ppl": "4.38", "accuracy": "65.199", "wer": "34.742", "wps": "76.5", "ups": "0.6", "wpb": "127.4", "bsz": "8", "num_updates": "15800", "lr": "0.00039605", "gnorm": "3.882", "loss_scale": "1024", "train_wall": "332", "gb_free": "7.1", "wall": "41187"} [2024-06-27 00:13:18,244][train_inner][INFO] - {"epoch": 1, "update": 0.53, "loss": "2.085", "ntokens": "127.065", "acc_total": "127.065", "n_correct": "83.66", "wer_total": "127.065", "n_error": "43.355", "ppl": "4.24", "accuracy": "65.84", "wer": "34.12", "wps": "75.2", "ups": "0.59", "wpb": "127.1", "bsz": "8", "num_updates": "16000", "lr": "0.000401", "gnorm": "3.856", "loss_scale": "1024", "train_wall": "332", "gb_free": "7.1", "wall": "41525"} [2024-06-27 00:18:58,538][train_inner][INFO] - {"epoch": 1, "update": 0.537, "loss": "2.191", "ntokens": "126.25", "acc_total": "126.25", "n_correct": "82.16", "wer_total": "126.25", "n_error": "44", "ppl": "4.57", "accuracy": "65.077", "wer": "34.851", "wps": "74.2", "ups": "0.59", "wpb": "126.2", "bsz": "8", "num_updates": "16200", "lr": "0.00040595", "gnorm": "3.981", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "41865"} [2024-06-27 00:24:31,385][train_inner][INFO] - {"epoch": 1, "update": 0.544, "loss": "2.099", "ntokens": "126.8", "acc_total": "126.8", "n_correct": "82.78", "wer_total": "126.8", "n_error": "43.965", "ppl": "4.28", "accuracy": "65.284", "wer": "34.673", "wps": "76.2", "ups": "0.6", "wpb": "126.8", "bsz": "8", "num_updates": "16400", "lr": "0.0004109", "gnorm": "3.901", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "42198"} [2024-06-27 00:30:05,369][train_inner][INFO] - {"epoch": 1, "update": 0.55, "loss": "2.103", "ntokens": "126.19", "acc_total": "126.19", "n_correct": "82.92", "wer_total": "126.19", "n_error": "43.2", "ppl": "4.3", "accuracy": "65.71", "wer": "34.234", "wps": "75.6", "ups": "0.6", "wpb": "126.2", "bsz": "8", "num_updates": "16600", "lr": "0.00041585", "gnorm": "4.042", "loss_scale": "2048", "train_wall": "333", "gb_free": "7.1", "wall": "42532"} [2024-06-27 00:35:39,024][train_inner][INFO] - {"epoch": 1, "update": 0.557, "loss": "2.195", "ntokens": "126.005", "acc_total": "126.005", "n_correct": "81.555", "wer_total": "126.005", "n_error": "44.385", "ppl": "4.58", "accuracy": "64.724", "wer": "35.225", "wps": "75.5", "ups": "0.6", "wpb": "126", "bsz": "8", "num_updates": "16800", "lr": "0.0004208", "gnorm": "3.909", "loss_scale": "2048", "train_wall": "333", "gb_free": "7.1", "wall": "42866"} [2024-06-27 00:41:12,948][train_inner][INFO] - {"epoch": 1, "update": 0.564, "loss": "2.138", "ntokens": "126.335", "acc_total": "126.335", "n_correct": "82.395", "wer_total": "126.335", "n_error": "43.87", "ppl": "4.4", "accuracy": "65.219", "wer": "34.725", "wps": "75.7", "ups": "0.6", "wpb": "126.3", "bsz": "8", "num_updates": "17000", "lr": "0.00042575", "gnorm": "4.037", "loss_scale": "2048", "train_wall": "333", "gb_free": "7.1", "wall": "43200"} [2024-06-27 00:46:46,084][train_inner][INFO] - {"epoch": 1, "update": 0.57, "loss": "2.082", "ntokens": "127.325", "acc_total": "127.325", "n_correct": "83.53", "wer_total": "127.325", "n_error": "43.745", "ppl": "4.23", "accuracy": "65.604", "wer": "34.357", "wps": "76.4", "ups": "0.6", "wpb": "127.3", "bsz": "8", "num_updates": "17200", "lr": "0.0004307", "gnorm": "3.988", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "43533"} [2024-06-27 00:52:27,265][train_inner][INFO] - {"epoch": 1, "update": 0.577, "loss": "2.129", "ntokens": "127.335", "acc_total": "127.335", "n_correct": "84.21", "wer_total": "127.335", "n_error": "43.065", "ppl": "4.37", "accuracy": "66.133", "wer": "33.82", "wps": "74.6", "ups": "0.59", "wpb": "127.3", "bsz": "8", "num_updates": "17400", "lr": "0.00043565", "gnorm": "4.094", "loss_scale": "2048", "train_wall": "333", "gb_free": "7.1", "wall": "43874"} [2024-06-27 00:55:21,924][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 01:37:17,060][valid][INFO] - {"epoch": 1, "valid_loss": "1.848", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.4996", "valid_wer_total": "18.1585", "valid_n_error": "4.65311", "valid_ppl": "3.6", "valid_accuracy": "74.343", "valid_wer": "25.625", "valid_wps": "173.2", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "17500", "valid_best_accuracy": "74.343"} [2024-06-27 01:37:17,064][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 17500 updates [2024-06-27 01:37:17,064][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_17500.pt [2024-06-27 01:37:20,364][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_17500.pt [2024-06-27 01:37:25,977][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_17500.pt (epoch 1 @ 17500 updates, score 74.343) (writing took 8.91315940907225 seconds) [2024-06-27 01:40:12,328][train_inner][INFO] - {"epoch": 1, "update": 0.583, "loss": "2.193", "ntokens": "125.52", "acc_total": "125.52", "n_correct": "83.12", "wer_total": "125.52", "n_error": "42.325", "ppl": "4.57", "accuracy": "66.221", "wer": "33.72", "wps": "8.8", "ups": "0.07", "wpb": "125.5", "bsz": "8", "num_updates": "17600", "lr": "0.0004406", "gnorm": "4.076", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "46739"} [2024-06-27 01:45:45,505][train_inner][INFO] - {"epoch": 1, "update": 0.59, "loss": "2.113", "ntokens": "127.195", "acc_total": "127.195", "n_correct": "85.905", "wer_total": "127.195", "n_error": "41.21", "ppl": "4.33", "accuracy": "67.538", "wer": "32.399", "wps": "76.4", "ups": "0.6", "wpb": "127.2", "bsz": "8", "num_updates": "17800", "lr": "0.00044555", "gnorm": "4.127", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "47072"} [2024-06-27 01:51:18,675][train_inner][INFO] - {"epoch": 1, "update": 0.597, "loss": "2.102", "ntokens": "127.28", "acc_total": "127.28", "n_correct": "85.75", "wer_total": "127.28", "n_error": "41.47", "ppl": "4.29", "accuracy": "67.371", "wer": "32.582", "wps": "76.4", "ups": "0.6", "wpb": "127.3", "bsz": "8", "num_updates": "18000", "lr": "0.0004505", "gnorm": "3.959", "loss_scale": "2048", "train_wall": "332", "gb_free": "7.1", "wall": "47405"} [2024-06-27 01:54:18,622][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0 [2024-06-27 01:57:30,841][train_inner][INFO] - {"epoch": 1, "update": 0.603, "loss": "2.525", "ntokens": "126.965", "acc_total": "126.965", "n_correct": "80.92", "wer_total": "126.965", "n_error": "45.99", "ppl": "5.75", "accuracy": "63.734", "wer": "36.223", "wps": "68.2", "ups": "0.54", "wpb": "127", "bsz": "8", "num_updates": "18200", "lr": "0.00045545", "gnorm": "10.624", "loss_scale": "2048", "train_wall": "371", "gb_free": "6.5", "wall": "47778"} [2024-06-27 02:03:40,644][train_inner][INFO] - {"epoch": 1, "update": 0.61, "loss": "2.318", "ntokens": "126.68", "acc_total": "126.68", "n_correct": "80.845", "wer_total": "126.68", "n_error": "45.81", "ppl": "4.99", "accuracy": "63.818", "wer": "36.162", "wps": "68.5", "ups": "0.54", "wpb": "126.7", "bsz": "8", "num_updates": "18400", "lr": "0.0004604", "gnorm": "9.462", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "48147"} [2024-06-27 02:09:50,206][train_inner][INFO] - {"epoch": 1, "update": 0.617, "loss": "2.364", "ntokens": "126.215", "acc_total": "126.215", "n_correct": "81.97", "wer_total": "126.215", "n_error": "44.205", "ppl": "5.15", "accuracy": "64.945", "wer": "35.024", "wps": "68.3", "ups": "0.54", "wpb": "126.2", "bsz": "8", "num_updates": "18600", "lr": "0.00046535", "gnorm": "8.794", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "48517"} [2024-06-27 02:14:36,498][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 02:16:15,362][train_inner][INFO] - {"epoch": 1, "update": 0.623, "loss": "2.259", "ntokens": "126.255", "acc_total": "126.255", "n_correct": "81.865", "wer_total": "126.255", "n_error": "44.36", "ppl": "4.79", "accuracy": "64.841", "wer": "35.135", "wps": "65.6", "ups": "0.52", "wpb": "126.3", "bsz": "8", "num_updates": "18800", "lr": "0.0004703", "gnorm": "7.936", "loss_scale": "1024", "train_wall": "371", "gb_free": "6.5", "wall": "48902"} [2024-06-27 02:22:26,908][train_inner][INFO] - {"epoch": 1, "update": 0.63, "loss": "2.24", "ntokens": "126.93", "acc_total": "126.93", "n_correct": "85.16", "wer_total": "126.93", "n_error": "41.72", "ppl": "4.73", "accuracy": "67.092", "wer": "32.869", "wps": "68.3", "ups": "0.54", "wpb": "126.9", "bsz": "8", "num_updates": "19000", "lr": "0.00047525", "gnorm": "7.971", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "49274"} [2024-06-27 02:28:36,383][train_inner][INFO] - {"epoch": 1, "update": 0.637, "loss": "2.231", "ntokens": "126.58", "acc_total": "126.58", "n_correct": "85.83", "wer_total": "126.58", "n_error": "40.71", "ppl": "4.69", "accuracy": "67.807", "wer": "32.161", "wps": "68.5", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "19200", "lr": "0.0004802", "gnorm": "7.777", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "49643"} [2024-06-27 02:34:46,024][train_inner][INFO] - {"epoch": 1, "update": 0.643, "loss": "2.212", "ntokens": "126.58", "acc_total": "126.58", "n_correct": "87.185", "wer_total": "126.58", "n_error": "39.305", "ppl": "4.63", "accuracy": "68.877", "wer": "31.052", "wps": "68.5", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "19400", "lr": "0.00048515", "gnorm": "7.435", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "50013"} [2024-06-27 02:40:56,150][train_inner][INFO] - {"epoch": 1, "update": 0.65, "loss": "2.175", "ntokens": "125.875", "acc_total": "125.875", "n_correct": "86.175", "wer_total": "125.875", "n_error": "39.65", "ppl": "4.52", "accuracy": "68.461", "wer": "31.5", "wps": "68", "ups": "0.54", "wpb": "125.9", "bsz": "8", "num_updates": "19600", "lr": "0.0004901", "gnorm": "7.186", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "50383"} [2024-06-27 02:47:06,741][train_inner][INFO] - {"epoch": 1, "update": 0.656, "loss": "2.214", "ntokens": "126.505", "acc_total": "126.505", "n_correct": "87.38", "wer_total": "126.505", "n_error": "39.11", "ppl": "4.64", "accuracy": "69.072", "wer": "30.916", "wps": "68.3", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "19800", "lr": "0.00049505", "gnorm": "7", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "50754"} [2024-06-27 02:53:17,042][train_inner][INFO] - {"epoch": 1, "update": 0.663, "loss": "2.124", "ntokens": "126.515", "acc_total": "126.515", "n_correct": "88.945", "wer_total": "126.515", "n_error": "37.53", "ppl": "4.36", "accuracy": "70.304", "wer": "29.664", "wps": "68.3", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "20000", "lr": "0.0005", "gnorm": "6.578", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "51124"} [2024-06-27 02:53:17,043][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 03:35:18,415][valid][INFO] - {"epoch": 1, "valid_loss": "1.879", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.0835", "valid_wer_total": "18.1585", "valid_n_error": "5.06903", "valid_ppl": "3.68", "valid_accuracy": "72.051", "valid_wer": "27.915", "valid_wps": "172.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "20000", "valid_best_accuracy": "74.343"} [2024-06-27 03:35:18,416][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 20000 updates [2024-06-27 03:35:18,416][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_20000.pt [2024-06-27 03:35:21,706][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_20000.pt [2024-06-27 03:35:23,978][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_20000.pt (epoch 1 @ 20000 updates, score 72.051) (writing took 5.561788168968633 seconds) [2024-06-27 03:41:38,994][train_inner][INFO] - {"epoch": 1, "update": 0.67, "loss": "2.205", "ntokens": "126.4", "acc_total": "126.4", "n_correct": "88.655", "wer_total": "126.4", "n_error": "37.695", "ppl": "4.61", "accuracy": "70.138", "wer": "29.822", "wps": "8.7", "ups": "0.07", "wpb": "126.4", "bsz": "8", "num_updates": "20200", "lr": "0.000492566", "gnorm": "6.634", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "54026"} [2024-06-27 03:47:51,164][train_inner][INFO] - {"epoch": 1, "update": 0.676, "loss": "2.113", "ntokens": "127.165", "acc_total": "127.165", "n_correct": "90.325", "wer_total": "127.165", "n_error": "36.795", "ppl": "4.32", "accuracy": "71.03", "wer": "28.935", "wps": "68.3", "ups": "0.54", "wpb": "127.2", "bsz": "8", "num_updates": "20400", "lr": "0.000485243", "gnorm": "6.208", "loss_scale": "1024", "train_wall": "371", "gb_free": "6.5", "wall": "54398"} [2024-06-27 03:54:01,960][train_inner][INFO] - {"epoch": 1, "update": 0.683, "loss": "2.061", "ntokens": "127.45", "acc_total": "127.45", "n_correct": "91.34", "wer_total": "127.45", "n_error": "36.075", "ppl": "4.17", "accuracy": "71.667", "wer": "28.305", "wps": "68.7", "ups": "0.54", "wpb": "127.5", "bsz": "8", "num_updates": "20600", "lr": "0.000478029", "gnorm": "6.136", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "54769"} [2024-06-27 04:00:11,841][train_inner][INFO] - {"epoch": 1, "update": 0.69, "loss": "2.028", "ntokens": "127.73", "acc_total": "127.73", "n_correct": "91.995", "wer_total": "127.73", "n_error": "35.715", "ppl": "4.08", "accuracy": "72.023", "wer": "27.961", "wps": "69.1", "ups": "0.54", "wpb": "127.7", "bsz": "8", "num_updates": "20800", "lr": "0.000470922", "gnorm": "5.915", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "55139"} [2024-06-27 04:02:14,344][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 04:06:24,716][train_inner][INFO] - {"epoch": 1, "update": 0.696, "loss": "2.062", "ntokens": "126.54", "acc_total": "126.54", "n_correct": "90.5", "wer_total": "126.54", "n_error": "36", "ppl": "4.18", "accuracy": "71.519", "wer": "28.45", "wps": "67.9", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "21000", "lr": "0.000463921", "gnorm": "6.179", "loss_scale": "1024", "train_wall": "372", "gb_free": "6.5", "wall": "55511"} [2024-06-27 04:12:34,583][train_inner][INFO] - {"epoch": 1, "update": 0.703, "loss": "1.974", "ntokens": "127.105", "acc_total": "127.105", "n_correct": "91.695", "wer_total": "127.105", "n_error": "35.345", "ppl": "3.93", "accuracy": "72.141", "wer": "27.808", "wps": "68.7", "ups": "0.54", "wpb": "127.1", "bsz": "8", "num_updates": "21200", "lr": "0.000457024", "gnorm": "5.7", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "55881"} [2024-06-27 04:18:44,657][train_inner][INFO] - {"epoch": 1, "update": 0.71, "loss": "1.998", "ntokens": "126.11", "acc_total": "126.11", "n_correct": "91.195", "wer_total": "126.11", "n_error": "34.875", "ppl": "4", "accuracy": "72.314", "wer": "27.654", "wps": "68.2", "ups": "0.54", "wpb": "126.1", "bsz": "8", "num_updates": "21400", "lr": "0.00045023", "gnorm": "5.871", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "56251"} [2024-06-27 04:24:55,318][train_inner][INFO] - {"epoch": 1, "update": 0.716, "loss": "1.944", "ntokens": "126.6", "acc_total": "126.6", "n_correct": "92.435", "wer_total": "126.6", "n_error": "34.125", "ppl": "3.85", "accuracy": "73.013", "wer": "26.955", "wps": "68.3", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "21600", "lr": "0.000443536", "gnorm": "5.429", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "56622"} [2024-06-27 04:31:10,767][train_inner][INFO] - {"epoch": 1, "update": 0.723, "loss": "1.923", "ntokens": "127.72", "acc_total": "127.72", "n_correct": "90.915", "wer_total": "127.72", "n_error": "36.74", "ppl": "3.79", "accuracy": "71.183", "wer": "28.766", "wps": "68", "ups": "0.53", "wpb": "127.7", "bsz": "8", "num_updates": "21800", "lr": "0.000436942", "gnorm": "5.549", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "56998"} [2024-06-27 04:37:25,375][train_inner][INFO] - {"epoch": 1, "update": 0.729, "loss": "1.917", "ntokens": "126.96", "acc_total": "126.96", "n_correct": "92.195", "wer_total": "126.96", "n_error": "34.735", "ppl": "3.78", "accuracy": "72.617", "wer": "27.359", "wps": "67.8", "ups": "0.53", "wpb": "127", "bsz": "8", "num_updates": "22000", "lr": "0.000430446", "gnorm": "5.689", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "57372"} [2024-06-27 04:43:34,319][train_inner][INFO] - {"epoch": 1, "update": 0.736, "loss": "1.9", "ntokens": "127.235", "acc_total": "127.235", "n_correct": "93.35", "wer_total": "127.235", "n_error": "33.865", "ppl": "3.73", "accuracy": "73.368", "wer": "26.616", "wps": "69", "ups": "0.54", "wpb": "127.2", "bsz": "8", "num_updates": "22200", "lr": "0.000424046", "gnorm": "5.226", "loss_scale": "1024", "train_wall": "368", "gb_free": "6.5", "wall": "57741"} [2024-06-27 04:49:43,918][train_inner][INFO] - {"epoch": 1, "update": 0.743, "loss": "1.909", "ntokens": "125.87", "acc_total": "125.87", "n_correct": "92.81", "wer_total": "125.87", "n_error": "33.04", "ppl": "3.75", "accuracy": "73.735", "wer": "26.249", "wps": "68.1", "ups": "0.54", "wpb": "125.9", "bsz": "8", "num_updates": "22400", "lr": "0.000417742", "gnorm": "5.328", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "58111"} [2024-06-27 04:52:48,710][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 05:34:50,373][valid][INFO] - {"epoch": 1, "valid_loss": "1.536", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.687", "valid_wer_total": "18.1585", "valid_n_error": "4.46699", "valid_ppl": "2.9", "valid_accuracy": "75.375", "valid_wer": "24.6", "valid_wps": "172.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "22500", "valid_best_accuracy": "75.375"} [2024-06-27 05:34:50,374][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 22500 updates [2024-06-27 05:34:50,374][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_22500.pt [2024-06-27 05:34:53,679][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_22500.pt [2024-06-27 05:34:59,161][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_22500.pt (epoch 1 @ 22500 updates, score 75.375) (writing took 8.787058877991512 seconds) [2024-06-27 05:38:03,478][train_inner][INFO] - {"epoch": 1, "update": 0.749, "loss": "1.822", "ntokens": "127.18", "acc_total": "127.18", "n_correct": "95.34", "wer_total": "127.18", "n_error": "31.81", "ppl": "3.54", "accuracy": "74.965", "wer": "25.012", "wps": "8.8", "ups": "0.07", "wpb": "127.2", "bsz": "8", "num_updates": "22600", "lr": "0.000411531", "gnorm": "5.077", "loss_scale": "1024", "train_wall": "368", "gb_free": "6.5", "wall": "61010"} [2024-06-27 05:44:16,000][train_inner][INFO] - {"epoch": 1, "update": 0.756, "loss": "1.795", "ntokens": "126.555", "acc_total": "126.555", "n_correct": "94.93", "wer_total": "126.555", "n_error": "31.595", "ppl": "3.47", "accuracy": "75.011", "wer": "24.965", "wps": "67.9", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "22800", "lr": "0.000405413", "gnorm": "5.018", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "61383"} [2024-06-27 05:50:34,807][train_inner][INFO] - {"epoch": 1, "update": 0.763, "loss": "1.848", "ntokens": "126.82", "acc_total": "126.82", "n_correct": "94.9", "wer_total": "126.82", "n_error": "31.88", "ppl": "3.6", "accuracy": "74.83", "wer": "25.138", "wps": "67", "ups": "0.53", "wpb": "126.8", "bsz": "8", "num_updates": "23000", "lr": "0.000399386", "gnorm": "5.288", "loss_scale": "2048", "train_wall": "370", "gb_free": "6.5", "wall": "61762"} [2024-06-27 05:53:10,286][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 05:56:47,321][train_inner][INFO] - {"epoch": 1, "update": 0.769, "loss": "1.799", "ntokens": "127.6", "acc_total": "127.6", "n_correct": "96.33", "wer_total": "127.6", "n_error": "31.245", "ppl": "3.48", "accuracy": "75.494", "wer": "24.487", "wps": "68.5", "ups": "0.54", "wpb": "127.6", "bsz": "8", "num_updates": "23200", "lr": "0.000393448", "gnorm": "4.965", "loss_scale": "1024", "train_wall": "372", "gb_free": "6.5", "wall": "62134"} [2024-06-27 06:02:57,316][train_inner][INFO] - {"epoch": 1, "update": 0.776, "loss": "1.744", "ntokens": "127.45", "acc_total": "127.45", "n_correct": "96.365", "wer_total": "127.45", "n_error": "31.04", "ppl": "3.35", "accuracy": "75.61", "wer": "24.355", "wps": "68.9", "ups": "0.54", "wpb": "127.5", "bsz": "8", "num_updates": "23400", "lr": "0.000387599", "gnorm": "4.941", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "62504"} [2024-06-27 06:09:07,141][train_inner][INFO] - {"epoch": 1, "update": 0.782, "loss": "1.73", "ntokens": "126.73", "acc_total": "126.73", "n_correct": "95", "wer_total": "126.73", "n_error": "31.71", "ppl": "3.32", "accuracy": "74.963", "wer": "25.022", "wps": "68.5", "ups": "0.54", "wpb": "126.7", "bsz": "8", "num_updates": "23600", "lr": "0.000381836", "gnorm": "4.823", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "62874"} [2024-06-27 06:15:17,030][train_inner][INFO] - {"epoch": 1, "update": 0.789, "loss": "1.711", "ntokens": "125.56", "acc_total": "125.56", "n_correct": "92.38", "wer_total": "125.56", "n_error": "33.17", "ppl": "3.27", "accuracy": "73.574", "wer": "26.418", "wps": "67.9", "ups": "0.54", "wpb": "125.6", "bsz": "8", "num_updates": "23800", "lr": "0.00037616", "gnorm": "4.832", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "63244"} [2024-06-27 06:21:27,205][train_inner][INFO] - {"epoch": 1, "update": 0.796, "loss": "1.695", "ntokens": "126.46", "acc_total": "126.46", "n_correct": "94.955", "wer_total": "126.46", "n_error": "31.49", "ppl": "3.24", "accuracy": "75.087", "wer": "24.901", "wps": "68.3", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "24000", "lr": "0.000370567", "gnorm": "4.627", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "63614"} [2024-06-27 06:27:36,605][train_inner][INFO] - {"epoch": 1, "update": 0.802, "loss": "1.716", "ntokens": "127.435", "acc_total": "127.435", "n_correct": "96", "wer_total": "127.435", "n_error": "31.43", "ppl": "3.29", "accuracy": "75.333", "wer": "24.664", "wps": "69", "ups": "0.54", "wpb": "127.4", "bsz": "8", "num_updates": "24200", "lr": "0.000365058", "gnorm": "4.761", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "63983"} [2024-06-27 06:33:48,968][train_inner][INFO] - {"epoch": 1, "update": 0.809, "loss": "1.681", "ntokens": "126.655", "acc_total": "126.655", "n_correct": "97.24", "wer_total": "126.655", "n_error": "29.37", "ppl": "3.21", "accuracy": "76.775", "wer": "23.189", "wps": "68", "ups": "0.54", "wpb": "126.7", "bsz": "8", "num_updates": "24400", "lr": "0.000359631", "gnorm": "4.996", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "64356"} [2024-06-27 06:40:10,008][train_inner][INFO] - {"epoch": 1, "update": 0.816, "loss": "1.652", "ntokens": "127.26", "acc_total": "127.26", "n_correct": "98.435", "wer_total": "127.26", "n_error": "28.81", "ppl": "3.14", "accuracy": "77.35", "wer": "22.639", "wps": "66.8", "ups": "0.52", "wpb": "127.3", "bsz": "8", "num_updates": "24600", "lr": "0.000354284", "gnorm": "4.558", "loss_scale": "1024", "train_wall": "371", "gb_free": "6.5", "wall": "64737"} [2024-06-27 06:46:19,523][train_inner][INFO] - {"epoch": 1, "update": 0.822, "loss": "1.682", "ntokens": "126.065", "acc_total": "126.065", "n_correct": "96.265", "wer_total": "126.065", "n_error": "29.785", "ppl": "3.21", "accuracy": "76.361", "wer": "23.627", "wps": "68.2", "ups": "0.54", "wpb": "126.1", "bsz": "8", "num_updates": "24800", "lr": "0.000349017", "gnorm": "4.831", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "65106"} [2024-06-27 06:52:29,001][train_inner][INFO] - {"epoch": 1, "update": 0.829, "loss": "1.61", "ntokens": "127.985", "acc_total": "127.985", "n_correct": "98.455", "wer_total": "127.985", "n_error": "29.515", "ppl": "3.05", "accuracy": "76.927", "wer": "23.061", "wps": "69.3", "ups": "0.54", "wpb": "128", "bsz": "8", "num_updates": "25000", "lr": "0.000343828", "gnorm": "4.478", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "65476"} [2024-06-27 06:52:29,001][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 07:34:26,746][valid][INFO] - {"epoch": 1, "valid_loss": "1.403", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.8943", "valid_wer_total": "18.1585", "valid_n_error": "4.2599", "valid_ppl": "2.65", "valid_accuracy": "76.517", "valid_wer": "23.46", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "25000", "valid_best_accuracy": "76.517"} [2024-06-27 07:34:26,746][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 25000 updates [2024-06-27 07:34:26,747][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_25000.pt [2024-06-27 07:34:30,048][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_25000.pt [2024-06-27 07:34:40,986][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_25000.pt (epoch 1 @ 25000 updates, score 76.517) (writing took 14.239499802934006 seconds) [2024-06-27 07:40:51,229][train_inner][INFO] - {"epoch": 1, "update": 0.836, "loss": "1.586", "ntokens": "127.5", "acc_total": "127.5", "n_correct": "98.855", "wer_total": "127.5", "n_error": "28.64", "ppl": "3", "accuracy": "77.533", "wer": "22.463", "wps": "8.8", "ups": "0.07", "wpb": "127.5", "bsz": "8", "num_updates": "25200", "lr": "0.000338716", "gnorm": "4.653", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "68378"} [2024-06-27 07:41:41,072][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 07:47:03,231][train_inner][INFO] - {"epoch": 1, "update": 0.842, "loss": "1.617", "ntokens": "127.14", "acc_total": "127.14", "n_correct": "98.73", "wer_total": "127.14", "n_error": "28.39", "ppl": "3.07", "accuracy": "77.655", "wer": "22.33", "wps": "68.4", "ups": "0.54", "wpb": "127.1", "bsz": "8", "num_updates": "25400", "lr": "0.000333681", "gnorm": "4.543", "loss_scale": "1024", "train_wall": "371", "gb_free": "6.5", "wall": "68750"} [2024-06-27 07:53:13,006][train_inner][INFO] - {"epoch": 1, "update": 0.849, "loss": "1.624", "ntokens": "127.52", "acc_total": "127.52", "n_correct": "99.13", "wer_total": "127.52", "n_error": "28.375", "ppl": "3.08", "accuracy": "77.737", "wer": "22.251", "wps": "69", "ups": "0.54", "wpb": "127.5", "bsz": "8", "num_updates": "25600", "lr": "0.00032872", "gnorm": "4.598", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "69120"} [2024-06-27 07:59:24,724][train_inner][INFO] - {"epoch": 1, "update": 0.855, "loss": "1.586", "ntokens": "128.045", "acc_total": "128.045", "n_correct": "100.105", "wer_total": "128.045", "n_error": "27.935", "ppl": "3", "accuracy": "78.18", "wer": "21.817", "wps": "68.9", "ups": "0.54", "wpb": "128", "bsz": "8", "num_updates": "25800", "lr": "0.000323833", "gnorm": "4.308", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "69492"} [2024-06-27 08:05:40,924][train_inner][INFO] - {"epoch": 1, "update": 0.862, "loss": "1.589", "ntokens": "126.37", "acc_total": "126.37", "n_correct": "98.69", "wer_total": "126.37", "n_error": "27.655", "ppl": "3.01", "accuracy": "78.096", "wer": "21.884", "wps": "67.2", "ups": "0.53", "wpb": "126.4", "bsz": "8", "num_updates": "26000", "lr": "0.000319018", "gnorm": "4.298", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "69868"} [2024-06-27 08:11:50,965][train_inner][INFO] - {"epoch": 1, "update": 0.869, "loss": "1.523", "ntokens": "127.455", "acc_total": "127.455", "n_correct": "100.93", "wer_total": "127.455", "n_error": "26.51", "ppl": "2.87", "accuracy": "79.189", "wer": "20.799", "wps": "68.9", "ups": "0.54", "wpb": "127.5", "bsz": "8", "num_updates": "26200", "lr": "0.000314275", "gnorm": "4.421", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "70238"} [2024-06-27 08:18:01,024][train_inner][INFO] - {"epoch": 1, "update": 0.875, "loss": "1.544", "ntokens": "127.955", "acc_total": "127.955", "n_correct": "100.58", "wer_total": "127.955", "n_error": "27.365", "ppl": "2.92", "accuracy": "78.606", "wer": "21.386", "wps": "69.2", "ups": "0.54", "wpb": "128", "bsz": "8", "num_updates": "26400", "lr": "0.000309603", "gnorm": "4.311", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "70608"} [2024-06-27 08:24:11,409][train_inner][INFO] - {"epoch": 1, "update": 0.882, "loss": "1.497", "ntokens": "127.23", "acc_total": "127.23", "n_correct": "100.715", "wer_total": "127.23", "n_error": "26.495", "ppl": "2.82", "accuracy": "79.16", "wer": "20.824", "wps": "68.7", "ups": "0.54", "wpb": "127.2", "bsz": "8", "num_updates": "26600", "lr": "0.000305", "gnorm": "4.27", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "70978"} [2024-06-27 08:30:21,965][train_inner][INFO] - {"epoch": 1, "update": 0.889, "loss": "1.526", "ntokens": "127.435", "acc_total": "127.435", "n_correct": "100.775", "wer_total": "127.435", "n_error": "26.655", "ppl": "2.88", "accuracy": "79.08", "wer": "20.917", "wps": "68.8", "ups": "0.54", "wpb": "127.4", "bsz": "8", "num_updates": "26800", "lr": "0.000300466", "gnorm": "4.326", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "71349"} [2024-06-27 08:36:31,903][train_inner][INFO] - {"epoch": 1, "update": 0.895, "loss": "1.516", "ntokens": "127.57", "acc_total": "127.57", "n_correct": "100.925", "wer_total": "127.57", "n_error": "26.615", "ppl": "2.86", "accuracy": "79.113", "wer": "20.863", "wps": "69", "ups": "0.54", "wpb": "127.6", "bsz": "8", "num_updates": "27000", "lr": "0.000295999", "gnorm": "4.301", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "71719"} [2024-06-27 08:42:42,171][train_inner][INFO] - {"epoch": 1, "update": 0.902, "loss": "1.538", "ntokens": "126.875", "acc_total": "126.875", "n_correct": "99.915", "wer_total": "126.875", "n_error": "26.955", "ppl": "2.9", "accuracy": "78.751", "wer": "21.245", "wps": "68.5", "ups": "0.54", "wpb": "126.9", "bsz": "8", "num_updates": "27200", "lr": "0.000291598", "gnorm": "4.265", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "72089"} [2024-06-27 08:49:00,398][train_inner][INFO] - {"epoch": 1, "update": 0.908, "loss": "1.467", "ntokens": "127.28", "acc_total": "127.28", "n_correct": "101.01", "wer_total": "127.28", "n_error": "26.25", "ppl": "2.76", "accuracy": "79.36", "wer": "20.624", "wps": "67.3", "ups": "0.53", "wpb": "127.3", "bsz": "8", "num_updates": "27400", "lr": "0.000287263", "gnorm": "4.288", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "72467"} [2024-06-27 08:52:11,203][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 09:34:08,158][valid][INFO] - {"epoch": 1, "valid_loss": "1.272", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "14.7983", "valid_wer_total": "18.1585", "valid_n_error": "3.35848", "valid_ppl": "2.42", "valid_accuracy": "81.495", "valid_wer": "18.495", "valid_wps": "173.1", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "27500", "valid_best_accuracy": "81.495"} [2024-06-27 09:34:08,159][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 27500 updates [2024-06-27 09:34:08,159][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_27500.pt [2024-06-27 09:34:11,470][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_27500.pt [2024-06-27 09:34:17,000][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_27500.pt (epoch 1 @ 27500 updates, score 81.495) (writing took 8.841480945935473 seconds) [2024-06-27 09:37:21,357][train_inner][INFO] - {"epoch": 1, "update": 0.915, "loss": "1.496", "ntokens": "127.335", "acc_total": "127.335", "n_correct": "100.475", "wer_total": "127.335", "n_error": "26.84", "ppl": "2.82", "accuracy": "78.906", "wer": "21.078", "wps": "8.8", "ups": "0.07", "wpb": "127.3", "bsz": "8", "num_updates": "27600", "lr": "0.000282992", "gnorm": "4.25", "loss_scale": "2048", "train_wall": "368", "gb_free": "6.5", "wall": "75368"} [2024-06-27 09:41:42,277][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 09:43:33,152][train_inner][INFO] - {"epoch": 1, "update": 0.922, "loss": "1.482", "ntokens": "126.73", "acc_total": "126.73", "n_correct": "100.93", "wer_total": "126.73", "n_error": "25.785", "ppl": "2.79", "accuracy": "79.642", "wer": "20.346", "wps": "68.2", "ups": "0.54", "wpb": "126.7", "bsz": "8", "num_updates": "27800", "lr": "0.000278785", "gnorm": "4.009", "loss_scale": "1024", "train_wall": "371", "gb_free": "6.5", "wall": "75740"} [2024-06-27 09:49:42,351][train_inner][INFO] - {"epoch": 1, "update": 0.928, "loss": "1.47", "ntokens": "126.18", "acc_total": "126.18", "n_correct": "100.39", "wer_total": "126.18", "n_error": "25.77", "ppl": "2.77", "accuracy": "79.561", "wer": "20.423", "wps": "68.4", "ups": "0.54", "wpb": "126.2", "bsz": "8", "num_updates": "28000", "lr": "0.00027464", "gnorm": "4.152", "loss_scale": "1024", "train_wall": "368", "gb_free": "6.5", "wall": "76109"} [2024-06-27 09:55:52,195][train_inner][INFO] - {"epoch": 1, "update": 0.935, "loss": "1.413", "ntokens": "126.625", "acc_total": "126.625", "n_correct": "101.465", "wer_total": "126.625", "n_error": "25.155", "ppl": "2.66", "accuracy": "80.13", "wer": "19.866", "wps": "68.5", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "28200", "lr": "0.000270557", "gnorm": "3.983", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "76479"} [2024-06-27 10:02:00,895][train_inner][INFO] - {"epoch": 1, "update": 0.942, "loss": "1.412", "ntokens": "126.975", "acc_total": "126.975", "n_correct": "102.065", "wer_total": "126.975", "n_error": "24.91", "ppl": "2.66", "accuracy": "80.382", "wer": "19.618", "wps": "68.9", "ups": "0.54", "wpb": "127", "bsz": "8", "num_updates": "28400", "lr": "0.000266535", "gnorm": "4", "loss_scale": "1024", "train_wall": "368", "gb_free": "6.5", "wall": "76848"} [2024-06-27 10:08:11,666][train_inner][INFO] - {"epoch": 1, "update": 0.948, "loss": "1.418", "ntokens": "126.94", "acc_total": "126.94", "n_correct": "101.93", "wer_total": "126.94", "n_error": "24.99", "ppl": "2.67", "accuracy": "80.298", "wer": "19.686", "wps": "68.5", "ups": "0.54", "wpb": "126.9", "bsz": "8", "num_updates": "28600", "lr": "0.000262572", "gnorm": "4.03", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "77218"} [2024-06-27 10:14:22,469][train_inner][INFO] - {"epoch": 1, "update": 0.955, "loss": "1.358", "ntokens": "126.79", "acc_total": "126.79", "n_correct": "102.52", "wer_total": "126.79", "n_error": "24.25", "ppl": "2.56", "accuracy": "80.858", "wer": "19.126", "wps": "68.4", "ups": "0.54", "wpb": "126.8", "bsz": "8", "num_updates": "28800", "lr": "0.000258668", "gnorm": "3.913", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "77589"} [2024-06-27 10:20:32,038][train_inner][INFO] - {"epoch": 1, "update": 0.962, "loss": "1.365", "ntokens": "126.615", "acc_total": "126.615", "n_correct": "102.405", "wer_total": "126.615", "n_error": "24.195", "ppl": "2.58", "accuracy": "80.879", "wer": "19.109", "wps": "68.5", "ups": "0.54", "wpb": "126.6", "bsz": "8", "num_updates": "29000", "lr": "0.000254823", "gnorm": "4.028", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "77959"} [2024-06-27 10:26:42,100][train_inner][INFO] - {"epoch": 1, "update": 0.968, "loss": "1.404", "ntokens": "127.12", "acc_total": "127.12", "n_correct": "102.435", "wer_total": "127.12", "n_error": "24.655", "ppl": "2.65", "accuracy": "80.581", "wer": "19.395", "wps": "68.7", "ups": "0.54", "wpb": "127.1", "bsz": "8", "num_updates": "29200", "lr": "0.000251034", "gnorm": "3.838", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "78329"} [2024-06-27 10:32:54,335][train_inner][INFO] - {"epoch": 1, "update": 0.975, "loss": "1.369", "ntokens": "127.005", "acc_total": "127.005", "n_correct": "102.545", "wer_total": "127.005", "n_error": "24.455", "ppl": "2.58", "accuracy": "80.741", "wer": "19.255", "wps": "68.2", "ups": "0.54", "wpb": "127", "bsz": "8", "num_updates": "29400", "lr": "0.000247302", "gnorm": "3.772", "loss_scale": "1024", "train_wall": "370", "gb_free": "6.5", "wall": "78701"} [2024-06-27 10:39:05,377][train_inner][INFO] - {"epoch": 1, "update": 0.981, "loss": "1.36", "ntokens": "126.45", "acc_total": "126.45", "n_correct": "102.355", "wer_total": "126.45", "n_error": "24.075", "ppl": "2.57", "accuracy": "80.945", "wer": "19.039", "wps": "68.2", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "29600", "lr": "0.000243626", "gnorm": "3.768", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "79072"} [2024-06-27 10:45:14,759][train_inner][INFO] - {"epoch": 1, "update": 0.988, "loss": "1.398", "ntokens": "126.835", "acc_total": "126.835", "n_correct": "102.245", "wer_total": "126.835", "n_error": "24.58", "ppl": "2.64", "accuracy": "80.613", "wer": "19.38", "wps": "68.7", "ups": "0.54", "wpb": "126.8", "bsz": "8", "num_updates": "29800", "lr": "0.000240004", "gnorm": "3.914", "loss_scale": "2048", "train_wall": "369", "gb_free": "6.5", "wall": "79442"} [2024-06-27 10:46:58,592][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 10:51:27,333][train_inner][INFO] - {"epoch": 1, "update": 0.995, "loss": "1.339", "ntokens": "126.235", "acc_total": "126.235", "n_correct": "102.265", "wer_total": "126.235", "n_error": "23.97", "ppl": "2.53", "accuracy": "81.012", "wer": "18.988", "wps": "67.8", "ups": "0.54", "wpb": "126.2", "bsz": "8", "num_updates": "30000", "lr": "0.000236435", "gnorm": "3.97", "loss_scale": "1024", "train_wall": "372", "gb_free": "6.5", "wall": "79814"} [2024-06-27 10:51:27,334][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 11:33:25,417][valid][INFO] - {"epoch": 1, "valid_loss": "1.115", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.228", "valid_wer_total": "18.1585", "valid_n_error": "2.92939", "valid_ppl": "2.17", "valid_accuracy": "83.861", "valid_wer": "16.132", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "30000", "valid_best_accuracy": "83.861"} [2024-06-27 11:33:25,418][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 30000 updates [2024-06-27 11:33:25,418][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_30000.pt [2024-06-27 11:33:28,789][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_30000.pt [2024-06-27 11:33:34,110][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_30000.pt (epoch 1 @ 30000 updates, score 83.861) (writing took 8.692518908996135 seconds) [2024-06-27 11:38:28,639][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 12:20:31,222][valid][INFO] - {"epoch": 1, "valid_loss": "1.131", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.1894", "valid_wer_total": "18.1585", "valid_n_error": "2.96832", "valid_ppl": "2.19", "valid_accuracy": "83.649", "valid_wer": "16.347", "valid_wps": "172.7", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "30159", "valid_best_accuracy": "83.861"} [2024-06-27 12:20:31,222][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 30159 updates [2024-06-27 12:20:31,223][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_last.pt [2024-06-27 12:20:34,878][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_last.pt [2024-06-27 12:20:34,963][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_last.pt (epoch 1 @ 30159 updates, score 83.649) (writing took 3.7400010810233653 seconds) [2024-06-27 12:20:34,963][fairseq_cli.train][INFO] - end of epoch 1 (average epoch stats below) [2024-06-27 12:20:34,965][train][INFO] - {"epoch": 1, "train_loss": "2.712", "train_ntokens": "126.906", "train_acc_total": "126.906", "train_n_correct": "78.095", "train_wer_total": "126.906", "train_n_error": "48.7237", "train_ppl": "6.55", "train_accuracy": "61.538", "train_wer": "38.394", "train_wps": "44.9", "train_ups": "0.35", "train_wpb": "126.9", "train_bsz": "8", "train_num_updates": "30159", "train_lr": "0.000233637", "train_gnorm": "4.759", "train_loss_scale": "1024", "train_train_wall": "52095", "train_gb_free": "6.5", "train_wall": "85162"} [2024-06-27 12:20:35,061][fairseq.trainer][INFO] - begin training epoch 2 [2024-06-27 12:20:35,062][fairseq_cli.train][INFO] - Start iterating over samples [2024-06-27 12:21:49,975][train_inner][INFO] - {"epoch": 2, "update": 1.001, "loss": "1.333", "ntokens": "126.29", "acc_total": "126.29", "n_correct": "102.745", "wer_total": "126.29", "n_error": "23.535", "ppl": "2.52", "accuracy": "81.356", "wer": "18.636", "wps": "4.7", "ups": "0.04", "wpb": "126.3", "bsz": "8", "num_updates": "30200", "lr": "0.00023292", "gnorm": "3.824", "loss_scale": "1024", "train_wall": "369", "gb_free": "6.5", "wall": "85237"} [2024-06-27 12:27:55,471][train_inner][INFO] - {"epoch": 2, "update": 1.008, "loss": "1.158", "ntokens": "126.895", "acc_total": "126.895", "n_correct": "105.96", "wer_total": "126.895", "n_error": "20.935", "ppl": "2.23", "accuracy": "83.502", "wer": "16.498", "wps": "69.4", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "30400", "lr": "0.000229457", "gnorm": "3.687", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "85602"} [2024-06-27 12:34:00,955][train_inner][INFO] - {"epoch": 2, "update": 1.015, "loss": "1.214", "ntokens": "126.385", "acc_total": "126.385", "n_correct": "104.19", "wer_total": "126.385", "n_error": "22.19", "ppl": "2.32", "accuracy": "82.439", "wer": "17.557", "wps": "69.2", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "30600", "lr": "0.000226046", "gnorm": "3.684", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "85968"} [2024-06-27 12:40:06,449][train_inner][INFO] - {"epoch": 2, "update": 1.021, "loss": "1.229", "ntokens": "126.52", "acc_total": "126.52", "n_correct": "104.36", "wer_total": "126.52", "n_error": "22.15", "ppl": "2.34", "accuracy": "82.485", "wer": "17.507", "wps": "69.2", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "30800", "lr": "0.000222685", "gnorm": "3.847", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "86333"} [2024-06-27 12:46:12,080][train_inner][INFO] - {"epoch": 2, "update": 1.028, "loss": "1.232", "ntokens": "126.69", "acc_total": "126.69", "n_correct": "104.065", "wer_total": "126.69", "n_error": "22.61", "ppl": "2.35", "accuracy": "82.141", "wer": "17.847", "wps": "69.3", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "31000", "lr": "0.000219375", "gnorm": "3.726", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "86699"} [2024-06-27 12:52:17,671][train_inner][INFO] - {"epoch": 2, "update": 1.035, "loss": "1.209", "ntokens": "126.34", "acc_total": "126.34", "n_correct": "104.605", "wer_total": "126.34", "n_error": "21.73", "ppl": "2.31", "accuracy": "82.796", "wer": "17.2", "wps": "69.1", "ups": "0.55", "wpb": "126.3", "bsz": "8", "num_updates": "31200", "lr": "0.000216113", "gnorm": "3.606", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "87064"} [2024-06-27 12:58:23,341][train_inner][INFO] - {"epoch": 2, "update": 1.041, "loss": "1.242", "ntokens": "127.885", "acc_total": "127.885", "n_correct": "105.075", "wer_total": "127.885", "n_error": "22.81", "ppl": "2.36", "accuracy": "82.164", "wer": "17.836", "wps": "69.9", "ups": "0.55", "wpb": "127.9", "bsz": "8", "num_updates": "31400", "lr": "0.0002129", "gnorm": "3.642", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "87430"} [2024-06-27 13:04:29,239][train_inner][INFO] - {"epoch": 2, "update": 1.048, "loss": "1.172", "ntokens": "126.545", "acc_total": "126.545", "n_correct": "104.98", "wer_total": "126.545", "n_error": "21.545", "ppl": "2.25", "accuracy": "82.959", "wer": "17.026", "wps": "69.2", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "31600", "lr": "0.000209735", "gnorm": "3.651", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "87796"} [2024-06-27 13:10:34,713][train_inner][INFO] - {"epoch": 2, "update": 1.054, "loss": "1.199", "ntokens": "126.03", "acc_total": "126.03", "n_correct": "104.44", "wer_total": "126.03", "n_error": "21.59", "ppl": "2.3", "accuracy": "82.869", "wer": "17.131", "wps": "69", "ups": "0.55", "wpb": "126", "bsz": "8", "num_updates": "31800", "lr": "0.000206617", "gnorm": "3.827", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "88161"} [2024-06-27 13:12:33,397][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 512.0 [2024-06-27 13:16:41,867][train_inner][INFO] - {"epoch": 2, "update": 1.061, "loss": "1.204", "ntokens": "126.495", "acc_total": "126.495", "n_correct": "104.49", "wer_total": "126.495", "n_error": "22", "ppl": "2.3", "accuracy": "82.604", "wer": "17.392", "wps": "68.9", "ups": "0.54", "wpb": "126.5", "bsz": "8", "num_updates": "32000", "lr": "0.000203545", "gnorm": "3.737", "loss_scale": "512", "train_wall": "366", "gb_free": "6.5", "wall": "88529"} [2024-06-27 13:22:47,285][train_inner][INFO] - {"epoch": 2, "update": 1.068, "loss": "1.212", "ntokens": "126.89", "acc_total": "126.89", "n_correct": "105.285", "wer_total": "126.89", "n_error": "21.595", "ppl": "2.32", "accuracy": "82.973", "wer": "17.019", "wps": "69.4", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "32200", "lr": "0.000200519", "gnorm": "3.578", "loss_scale": "512", "train_wall": "365", "gb_free": "6.5", "wall": "88894"} [2024-06-27 13:28:52,542][train_inner][INFO] - {"epoch": 2, "update": 1.074, "loss": "1.124", "ntokens": "125.855", "acc_total": "125.855", "n_correct": "105.8", "wer_total": "125.855", "n_error": "20.045", "ppl": "2.18", "accuracy": "84.065", "wer": "15.927", "wps": "68.9", "ups": "0.55", "wpb": "125.9", "bsz": "8", "num_updates": "32400", "lr": "0.000197538", "gnorm": "3.484", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "89259"} [2024-06-27 13:31:55,171][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 14:13:54,514][valid][INFO] - {"epoch": 2, "valid_loss": "1.065", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.3408", "valid_wer_total": "18.1585", "valid_n_error": "2.81663", "valid_ppl": "2.09", "valid_accuracy": "84.483", "valid_wer": "15.511", "valid_wps": "172.9", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "32500", "valid_best_accuracy": "84.483"} [2024-06-27 14:13:54,515][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 32500 updates [2024-06-27 14:13:54,515][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_32500.pt [2024-06-27 14:13:57,801][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_32500.pt [2024-06-27 14:14:03,161][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_32500.pt (epoch 2 @ 32500 updates, score 84.483) (writing took 8.645917238900438 seconds) [2024-06-27 14:17:05,411][train_inner][INFO] - {"epoch": 2, "update": 1.081, "loss": "1.206", "ntokens": "127.045", "acc_total": "127.045", "n_correct": "105.235", "wer_total": "127.045", "n_error": "21.8", "ppl": "2.31", "accuracy": "82.833", "wer": "17.159", "wps": "8.8", "ups": "0.07", "wpb": "127", "bsz": "8", "num_updates": "32600", "lr": "0.000194601", "gnorm": "3.637", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "92152"} [2024-06-27 14:23:10,613][train_inner][INFO] - {"epoch": 2, "update": 1.088, "loss": "1.189", "ntokens": "126.005", "acc_total": "126.005", "n_correct": "104.97", "wer_total": "126.005", "n_error": "21.025", "ppl": "2.28", "accuracy": "83.306", "wer": "16.686", "wps": "69", "ups": "0.55", "wpb": "126", "bsz": "8", "num_updates": "32800", "lr": "0.000191708", "gnorm": "3.602", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "92517"} [2024-06-27 14:29:15,493][train_inner][INFO] - {"epoch": 2, "update": 1.094, "loss": "1.119", "ntokens": "126.535", "acc_total": "126.535", "n_correct": "105.945", "wer_total": "126.535", "n_error": "20.575", "ppl": "2.17", "accuracy": "83.728", "wer": "16.26", "wps": "69.4", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "33000", "lr": "0.000188858", "gnorm": "3.555", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "92882"} [2024-06-27 14:35:20,564][train_inner][INFO] - {"epoch": 2, "update": 1.101, "loss": "1.114", "ntokens": "126.8", "acc_total": "126.8", "n_correct": "106.53", "wer_total": "126.8", "n_error": "20.265", "ppl": "2.16", "accuracy": "84.014", "wer": "15.982", "wps": "69.5", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "33200", "lr": "0.00018605", "gnorm": "3.492", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "93247"} [2024-06-27 14:41:25,701][train_inner][INFO] - {"epoch": 2, "update": 1.107, "loss": "1.152", "ntokens": "126.895", "acc_total": "126.895", "n_correct": "105.635", "wer_total": "126.895", "n_error": "21.26", "ppl": "2.22", "accuracy": "83.246", "wer": "16.754", "wps": "69.5", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "33400", "lr": "0.000183284", "gnorm": "3.581", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "93612"} [2024-06-27 14:47:30,964][train_inner][INFO] - {"epoch": 2, "update": 1.114, "loss": "1.179", "ntokens": "126.64", "acc_total": "126.64", "n_correct": "105.4", "wer_total": "126.64", "n_error": "21.23", "ppl": "2.26", "accuracy": "83.228", "wer": "16.764", "wps": "69.3", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "33600", "lr": "0.000180559", "gnorm": "3.637", "loss_scale": "512", "train_wall": "365", "gb_free": "6.5", "wall": "93978"} [2024-06-27 14:53:36,011][train_inner][INFO] - {"epoch": 2, "update": 1.121, "loss": "1.125", "ntokens": "126.795", "acc_total": "126.795", "n_correct": "106.62", "wer_total": "126.795", "n_error": "20.16", "ppl": "2.18", "accuracy": "84.088", "wer": "15.9", "wps": "69.5", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "33800", "lr": "0.000177875", "gnorm": "3.415", "loss_scale": "512", "train_wall": "364", "gb_free": "6.5", "wall": "94343"} [2024-06-27 14:59:41,279][train_inner][INFO] - {"epoch": 2, "update": 1.127, "loss": "1.094", "ntokens": "126.37", "acc_total": "126.37", "n_correct": "106.46", "wer_total": "126.37", "n_error": "19.905", "ppl": "2.13", "accuracy": "84.245", "wer": "15.751", "wps": "69.2", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "34000", "lr": "0.00017523", "gnorm": "3.487", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "94708"} [2024-06-27 15:05:46,533][train_inner][INFO] - {"epoch": 2, "update": 1.134, "loss": "1.115", "ntokens": "126.38", "acc_total": "126.38", "n_correct": "105.995", "wer_total": "126.38", "n_error": "20.37", "ppl": "2.17", "accuracy": "83.87", "wer": "16.118", "wps": "69.2", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "34200", "lr": "0.000172625", "gnorm": "3.457", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "95073"} [2024-06-27 15:11:51,884][train_inner][INFO] - {"epoch": 2, "update": 1.141, "loss": "1.123", "ntokens": "126.865", "acc_total": "126.865", "n_correct": "106.665", "wer_total": "126.865", "n_error": "20.2", "ppl": "2.18", "accuracy": "84.078", "wer": "15.922", "wps": "69.4", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "34400", "lr": "0.000170059", "gnorm": "3.572", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "95439"} [2024-06-27 15:17:57,470][train_inner][INFO] - {"epoch": 2, "update": 1.147, "loss": "1.134", "ntokens": "127.455", "acc_total": "127.455", "n_correct": "106.7", "wer_total": "127.455", "n_error": "20.75", "ppl": "2.2", "accuracy": "83.716", "wer": "16.28", "wps": "69.7", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "34600", "lr": "0.000167531", "gnorm": "3.61", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "95804"} [2024-06-27 15:24:02,725][train_inner][INFO] - {"epoch": 2, "update": 1.154, "loss": "1.064", "ntokens": "125.96", "acc_total": "125.96", "n_correct": "106.71", "wer_total": "125.96", "n_error": "19.245", "ppl": "2.09", "accuracy": "84.717", "wer": "15.279", "wps": "69", "ups": "0.55", "wpb": "126", "bsz": "8", "num_updates": "34800", "lr": "0.00016504", "gnorm": "3.401", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "96170"} [2024-06-27 15:30:08,067][train_inner][INFO] - {"epoch": 2, "update": 1.16, "loss": "1.145", "ntokens": "126.175", "acc_total": "126.175", "n_correct": "105.71", "wer_total": "126.175", "n_error": "20.46", "ppl": "2.21", "accuracy": "83.78", "wer": "16.216", "wps": "69.1", "ups": "0.55", "wpb": "126.2", "bsz": "8", "num_updates": "35000", "lr": "0.000162586", "gnorm": "3.485", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "96535"} [2024-06-27 15:30:08,067][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 16:12:07,536][valid][INFO] - {"epoch": 2, "valid_loss": "0.993", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.5099", "valid_wer_total": "18.1585", "valid_n_error": "2.64727", "valid_ppl": "1.99", "valid_accuracy": "85.414", "valid_wer": "14.579", "valid_wps": "172.9", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "35000", "valid_best_accuracy": "85.414"} [2024-06-27 16:12:07,537][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 35000 updates [2024-06-27 16:12:07,537][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_35000.pt [2024-06-27 16:12:10,823][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_35000.pt [2024-06-27 16:12:16,082][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_35000.pt (epoch 2 @ 35000 updates, score 85.414) (writing took 8.544590634061024 seconds) [2024-06-27 16:18:20,870][train_inner][INFO] - {"epoch": 2, "update": 1.167, "loss": "1.081", "ntokens": "126.77", "acc_total": "126.77", "n_correct": "107.08", "wer_total": "126.77", "n_error": "19.685", "ppl": "2.12", "accuracy": "84.468", "wer": "15.528", "wps": "8.8", "ups": "0.07", "wpb": "126.8", "bsz": "8", "num_updates": "35200", "lr": "0.000160169", "gnorm": "3.421", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "99428"} [2024-06-27 16:24:25,768][train_inner][INFO] - {"epoch": 2, "update": 1.174, "loss": "1.11", "ntokens": "125.895", "acc_total": "125.895", "n_correct": "105.795", "wer_total": "125.895", "n_error": "20.085", "ppl": "2.16", "accuracy": "84.034", "wer": "15.954", "wps": "69", "ups": "0.55", "wpb": "125.9", "bsz": "8", "num_updates": "35400", "lr": "0.000157788", "gnorm": "3.337", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "99793"} [2024-06-27 16:30:30,891][train_inner][INFO] - {"epoch": 2, "update": 1.18, "loss": "1.084", "ntokens": "127.075", "acc_total": "127.075", "n_correct": "107.115", "wer_total": "127.075", "n_error": "19.96", "ppl": "2.12", "accuracy": "84.293", "wer": "15.707", "wps": "69.6", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "35600", "lr": "0.000155442", "gnorm": "3.432", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "100158"} [2024-06-27 16:36:36,115][train_inner][INFO] - {"epoch": 2, "update": 1.187, "loss": "1.087", "ntokens": "127.47", "acc_total": "127.47", "n_correct": "107.675", "wer_total": "127.47", "n_error": "19.785", "ppl": "2.12", "accuracy": "84.471", "wer": "15.521", "wps": "69.8", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "35800", "lr": "0.000153131", "gnorm": "3.383", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "100523"} [2024-06-27 16:42:41,095][train_inner][INFO] - {"epoch": 2, "update": 1.194, "loss": "1.112", "ntokens": "126.495", "acc_total": "126.495", "n_correct": "106.585", "wer_total": "126.495", "n_error": "19.895", "ppl": "2.16", "accuracy": "84.26", "wer": "15.728", "wps": "69.3", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "36000", "lr": "0.000150854", "gnorm": "3.501", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "100888"} [2024-06-27 16:48:46,187][train_inner][INFO] - {"epoch": 2, "update": 1.2, "loss": "1.099", "ntokens": "126.42", "acc_total": "126.42", "n_correct": "106.505", "wer_total": "126.42", "n_error": "19.91", "ppl": "2.14", "accuracy": "84.247", "wer": "15.749", "wps": "69.3", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "36200", "lr": "0.000148612", "gnorm": "3.317", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "101253"} [2024-06-27 16:54:51,108][train_inner][INFO] - {"epoch": 2, "update": 1.207, "loss": "1.129", "ntokens": "126.195", "acc_total": "126.195", "n_correct": "105.93", "wer_total": "126.195", "n_error": "20.255", "ppl": "2.19", "accuracy": "83.942", "wer": "16.051", "wps": "69.2", "ups": "0.55", "wpb": "126.2", "bsz": "8", "num_updates": "36400", "lr": "0.000146402", "gnorm": "3.406", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "101618"} [2024-06-27 17:00:56,111][train_inner][INFO] - {"epoch": 2, "update": 1.214, "loss": "1.078", "ntokens": "126.53", "acc_total": "126.53", "n_correct": "106.8", "wer_total": "126.53", "n_error": "19.715", "ppl": "2.11", "accuracy": "84.407", "wer": "15.581", "wps": "69.3", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "36600", "lr": "0.000144226", "gnorm": "3.282", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "101983"} [2024-06-27 17:07:01,187][train_inner][INFO] - {"epoch": 2, "update": 1.22, "loss": "1.066", "ntokens": "126.695", "acc_total": "126.695", "n_correct": "107.285", "wer_total": "126.695", "n_error": "19.41", "ppl": "2.09", "accuracy": "84.68", "wer": "15.32", "wps": "69.4", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "36800", "lr": "0.000142081", "gnorm": "3.425", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "102348"} [2024-06-27 17:13:06,340][train_inner][INFO] - {"epoch": 2, "update": 1.227, "loss": "1.063", "ntokens": "127.73", "acc_total": "127.73", "n_correct": "108.195", "wer_total": "127.73", "n_error": "19.525", "ppl": "2.09", "accuracy": "84.706", "wer": "15.286", "wps": "70", "ups": "0.55", "wpb": "127.7", "bsz": "8", "num_updates": "37000", "lr": "0.000139969", "gnorm": "3.351", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "102713"} [2024-06-27 17:19:11,106][train_inner][INFO] - {"epoch": 2, "update": 1.233, "loss": "1.035", "ntokens": "126.57", "acc_total": "126.57", "n_correct": "107.24", "wer_total": "126.57", "n_error": "19.325", "ppl": "2.05", "accuracy": "84.728", "wer": "15.268", "wps": "69.4", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "37200", "lr": "0.000137888", "gnorm": "3.36", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "103078"} [2024-06-27 17:25:15,927][train_inner][INFO] - {"epoch": 2, "update": 1.24, "loss": "1.035", "ntokens": "127.18", "acc_total": "127.18", "n_correct": "107.445", "wer_total": "127.18", "n_error": "19.73", "ppl": "2.05", "accuracy": "84.483", "wer": "15.513", "wps": "69.7", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "37400", "lr": "0.000135838", "gnorm": "3.213", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "103443"} [2024-06-27 17:28:18,407][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 18:10:19,446][valid][INFO] - {"epoch": 2, "valid_loss": "0.946", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.5443", "valid_wer_total": "18.1585", "valid_n_error": "2.61346", "valid_ppl": "1.93", "valid_accuracy": "85.603", "valid_wer": "14.393", "valid_wps": "172.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "37500", "valid_best_accuracy": "85.603"} [2024-06-27 18:10:19,447][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 37500 updates [2024-06-27 18:10:19,447][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_37500.pt [2024-06-27 18:10:22,785][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_37500.pt [2024-06-27 18:10:28,216][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_37500.pt (epoch 2 @ 37500 updates, score 85.603) (writing took 8.769449746003374 seconds) [2024-06-27 18:13:30,212][train_inner][INFO] - {"epoch": 2, "update": 1.247, "loss": "1.063", "ntokens": "126.79", "acc_total": "126.79", "n_correct": "106.92", "wer_total": "126.79", "n_error": "19.855", "ppl": "2.09", "accuracy": "84.328", "wer": "15.66", "wps": "8.8", "ups": "0.07", "wpb": "126.8", "bsz": "8", "num_updates": "37600", "lr": "0.000133819", "gnorm": "3.349", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "106337"} [2024-06-27 18:19:35,188][train_inner][INFO] - {"epoch": 2, "update": 1.253, "loss": "1.05", "ntokens": "126.85", "acc_total": "126.85", "n_correct": "107.58", "wer_total": "126.85", "n_error": "19.26", "ppl": "2.07", "accuracy": "84.809", "wer": "15.183", "wps": "69.5", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "37800", "lr": "0.000131829", "gnorm": "3.24", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "106702"} [2024-06-27 18:25:39,475][train_inner][INFO] - {"epoch": 2, "update": 1.26, "loss": "1.018", "ntokens": "127.83", "acc_total": "127.83", "n_correct": "108.865", "wer_total": "127.83", "n_error": "18.965", "ppl": "2.02", "accuracy": "85.164", "wer": "14.836", "wps": "70.2", "ups": "0.55", "wpb": "127.8", "bsz": "8", "num_updates": "38000", "lr": "0.000129869", "gnorm": "3.138", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "107066"} [2024-06-27 18:26:21,320][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0 [2024-06-27 18:31:45,528][train_inner][INFO] - {"epoch": 2, "update": 1.267, "loss": "1.063", "ntokens": "126.44", "acc_total": "126.44", "n_correct": "106.86", "wer_total": "126.44", "n_error": "19.58", "ppl": "2.09", "accuracy": "84.514", "wer": "15.486", "wps": "69.1", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "38200", "lr": "0.000127939", "gnorm": "3.247", "loss_scale": "2048", "train_wall": "365", "gb_free": "6.5", "wall": "107432"} [2024-06-27 18:37:49,928][train_inner][INFO] - {"epoch": 2, "update": 1.273, "loss": "1.035", "ntokens": "126.83", "acc_total": "126.83", "n_correct": "107.82", "wer_total": "126.83", "n_error": "19.005", "ppl": "2.05", "accuracy": "85.011", "wer": "14.985", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "38400", "lr": "0.000126036", "gnorm": "3.15", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "107797"} [2024-06-27 18:43:54,455][train_inner][INFO] - {"epoch": 2, "update": 1.28, "loss": "1.063", "ntokens": "127.08", "acc_total": "127.08", "n_correct": "107.29", "wer_total": "127.08", "n_error": "19.785", "ppl": "2.09", "accuracy": "84.427", "wer": "15.569", "wps": "69.7", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "38600", "lr": "0.000124163", "gnorm": "3.171", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "108161"} [2024-06-27 18:49:59,241][train_inner][INFO] - {"epoch": 2, "update": 1.286, "loss": "1.03", "ntokens": "126.48", "acc_total": "126.48", "n_correct": "107.27", "wer_total": "126.48", "n_error": "19.21", "ppl": "2.04", "accuracy": "84.812", "wer": "15.188", "wps": "69.3", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "38800", "lr": "0.000122317", "gnorm": "3.249", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "108526"} [2024-06-27 18:56:03,691][train_inner][INFO] - {"epoch": 2, "update": 1.293, "loss": "1.037", "ntokens": "127.025", "acc_total": "127.025", "n_correct": "107.965", "wer_total": "127.025", "n_error": "19.055", "ppl": "2.05", "accuracy": "84.995", "wer": "15.001", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "39000", "lr": "0.000120498", "gnorm": "3.205", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "108890"} [2024-06-27 19:02:07,911][train_inner][INFO] - {"epoch": 2, "update": 1.3, "loss": "1.079", "ntokens": "127.4", "acc_total": "127.4", "n_correct": "107.605", "wer_total": "127.4", "n_error": "19.78", "ppl": "2.11", "accuracy": "84.462", "wer": "15.526", "wps": "70", "ups": "0.55", "wpb": "127.4", "bsz": "8", "num_updates": "39200", "lr": "0.000118707", "gnorm": "3.479", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "109255"} [2024-06-27 19:08:12,274][train_inner][INFO] - {"epoch": 2, "update": 1.306, "loss": "1.047", "ntokens": "126.97", "acc_total": "126.97", "n_correct": "107.71", "wer_total": "126.97", "n_error": "19.25", "ppl": "2.07", "accuracy": "84.831", "wer": "15.161", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "39400", "lr": "0.000116942", "gnorm": "3.224", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "109619"} [2024-06-27 19:14:16,570][train_inner][INFO] - {"epoch": 2, "update": 1.313, "loss": "0.98", "ntokens": "127.06", "acc_total": "127.06", "n_correct": "108.705", "wer_total": "127.06", "n_error": "18.355", "ppl": "1.97", "accuracy": "85.554", "wer": "14.446", "wps": "69.8", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "39600", "lr": "0.000115203", "gnorm": "3.026", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "109983"} [2024-06-27 19:20:20,929][train_inner][INFO] - {"epoch": 2, "update": 1.32, "loss": "1.037", "ntokens": "127.33", "acc_total": "127.33", "n_correct": "108.105", "wer_total": "127.33", "n_error": "19.215", "ppl": "2.05", "accuracy": "84.901", "wer": "15.091", "wps": "69.9", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "39800", "lr": "0.000113491", "gnorm": "3.136", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "110348"} [2024-06-27 19:26:25,190][train_inner][INFO] - {"epoch": 2, "update": 1.326, "loss": "1.008", "ntokens": "127.635", "acc_total": "127.635", "n_correct": "109.185", "wer_total": "127.635", "n_error": "18.445", "ppl": "2.01", "accuracy": "85.545", "wer": "14.451", "wps": "70.1", "ups": "0.55", "wpb": "127.6", "bsz": "8", "num_updates": "40000", "lr": "0.000111803", "gnorm": "3.066", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "110712"} [2024-06-27 19:26:25,191][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 20:08:21,069][valid][INFO] - {"epoch": 2, "valid_loss": "0.911", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.6759", "valid_wer_total": "18.1585", "valid_n_error": "2.48191", "valid_ppl": "1.88", "valid_accuracy": "86.329", "valid_wer": "13.668", "valid_wps": "173.2", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "40000", "valid_best_accuracy": "86.329"} [2024-06-27 20:08:21,070][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 40000 updates [2024-06-27 20:08:21,070][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_40000.pt [2024-06-27 20:08:24,377][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_40000.pt [2024-06-27 20:08:29,800][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_40000.pt (epoch 2 @ 40000 updates, score 86.329) (writing took 8.730217472882941 seconds) [2024-06-27 20:14:34,044][train_inner][INFO] - {"epoch": 2, "update": 1.333, "loss": "1", "ntokens": "127.615", "acc_total": "127.615", "n_correct": "109.095", "wer_total": "127.615", "n_error": "18.505", "ppl": "2", "accuracy": "85.488", "wer": "14.501", "wps": "8.8", "ups": "0.07", "wpb": "127.6", "bsz": "8", "num_updates": "40200", "lr": "0.000110141", "gnorm": "3.186", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "113601"} [2024-06-27 20:16:59,720][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0 [2024-06-27 20:20:40,085][train_inner][INFO] - {"epoch": 2, "update": 1.34, "loss": "1.02", "ntokens": "126.7", "acc_total": "126.7", "n_correct": "107.87", "wer_total": "126.7", "n_error": "18.825", "ppl": "2.03", "accuracy": "85.138", "wer": "14.858", "wps": "69.2", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "40400", "lr": "0.000108504", "gnorm": "3.151", "loss_scale": "2048", "train_wall": "365", "gb_free": "6.5", "wall": "113967"} [2024-06-27 20:26:44,502][train_inner][INFO] - {"epoch": 2, "update": 1.346, "loss": "0.966", "ntokens": "126.975", "acc_total": "126.975", "n_correct": "108.54", "wer_total": "126.975", "n_error": "18.435", "ppl": "1.95", "accuracy": "85.481", "wer": "14.519", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "40600", "lr": "0.000106891", "gnorm": "3.127", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "114331"} [2024-06-27 20:29:46,553][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 20:32:50,570][train_inner][INFO] - {"epoch": 2, "update": 1.353, "loss": "0.995", "ntokens": "127.64", "acc_total": "127.64", "n_correct": "108.705", "wer_total": "127.64", "n_error": "18.925", "ppl": "1.99", "accuracy": "85.165", "wer": "14.827", "wps": "69.7", "ups": "0.55", "wpb": "127.6", "bsz": "8", "num_updates": "40800", "lr": "0.000105301", "gnorm": "3.191", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "114697"} [2024-06-27 20:38:54,939][train_inner][INFO] - {"epoch": 2, "update": 1.359, "loss": "0.973", "ntokens": "127.125", "acc_total": "127.125", "n_correct": "109", "wer_total": "127.125", "n_error": "18.125", "ppl": "1.96", "accuracy": "85.742", "wer": "14.258", "wps": "69.8", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "41000", "lr": "0.000103736", "gnorm": "3.159", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "115062"} [2024-06-27 20:44:59,574][train_inner][INFO] - {"epoch": 2, "update": 1.366, "loss": "1.034", "ntokens": "126.045", "acc_total": "126.045", "n_correct": "107.11", "wer_total": "126.045", "n_error": "18.935", "ppl": "2.05", "accuracy": "84.978", "wer": "15.022", "wps": "69.1", "ups": "0.55", "wpb": "126", "bsz": "8", "num_updates": "41200", "lr": "0.000102194", "gnorm": "3.218", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "115426"} [2024-06-27 20:51:03,938][train_inner][INFO] - {"epoch": 2, "update": 1.373, "loss": "1.017", "ntokens": "126.745", "acc_total": "126.745", "n_correct": "108.075", "wer_total": "126.745", "n_error": "18.67", "ppl": "2.02", "accuracy": "85.27", "wer": "14.73", "wps": "69.6", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "41400", "lr": "0.000100674", "gnorm": "3.264", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "115791"} [2024-06-27 20:57:08,383][train_inner][INFO] - {"epoch": 2, "update": 1.379, "loss": "1.022", "ntokens": "126.63", "acc_total": "126.63", "n_correct": "107.095", "wer_total": "126.63", "n_error": "19.53", "ppl": "2.03", "accuracy": "84.573", "wer": "15.423", "wps": "69.5", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "41600", "lr": "9.91776e-05", "gnorm": "3.184", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "116155"} [2024-06-27 21:03:12,683][train_inner][INFO] - {"epoch": 2, "update": 1.386, "loss": "1.003", "ntokens": "126.39", "acc_total": "126.39", "n_correct": "107.61", "wer_total": "126.39", "n_error": "18.78", "ppl": "2", "accuracy": "85.141", "wer": "14.859", "wps": "69.4", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "41800", "lr": "9.77032e-05", "gnorm": "3.056", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "116519"} [2024-06-27 21:09:16,994][train_inner][INFO] - {"epoch": 2, "update": 1.393, "loss": "0.939", "ntokens": "127.025", "acc_total": "127.025", "n_correct": "109.15", "wer_total": "127.025", "n_error": "17.87", "ppl": "1.92", "accuracy": "85.928", "wer": "14.068", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "42000", "lr": "9.62506e-05", "gnorm": "3.122", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "116884"} [2024-06-27 21:15:21,691][train_inner][INFO] - {"epoch": 2, "update": 1.399, "loss": "0.977", "ntokens": "127.29", "acc_total": "127.29", "n_correct": "109.22", "wer_total": "127.29", "n_error": "18.07", "ppl": "1.97", "accuracy": "85.804", "wer": "14.196", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "42200", "lr": "9.48197e-05", "gnorm": "3.096", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "117248"} [2024-06-27 21:21:26,316][train_inner][INFO] - {"epoch": 2, "update": 1.406, "loss": "0.98", "ntokens": "128.865", "acc_total": "128.865", "n_correct": "110.255", "wer_total": "128.865", "n_error": "18.6", "ppl": "1.97", "accuracy": "85.559", "wer": "14.434", "wps": "70.7", "ups": "0.55", "wpb": "128.9", "bsz": "8", "num_updates": "42400", "lr": "9.341e-05", "gnorm": "3.115", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "117613"} [2024-06-27 21:24:28,589][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-27 22:06:26,339][valid][INFO] - {"epoch": 2, "valid_loss": "0.867", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.7907", "valid_wer_total": "18.1585", "valid_n_error": "2.36694", "valid_ppl": "1.82", "valid_accuracy": "86.961", "valid_wer": "13.035", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "42500", "valid_best_accuracy": "86.961"} [2024-06-27 22:06:26,340][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 42500 updates [2024-06-27 22:06:26,340][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_42500.pt [2024-06-27 22:06:29,666][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_42500.pt [2024-06-27 22:06:35,253][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_42500.pt (epoch 2 @ 42500 updates, score 86.961) (writing took 8.912540739169344 seconds) [2024-06-27 22:09:37,227][train_inner][INFO] - {"epoch": 2, "update": 1.413, "loss": "0.994", "ntokens": "126.51", "acc_total": "126.51", "n_correct": "108.425", "wer_total": "126.51", "n_error": "18.08", "ppl": "1.99", "accuracy": "85.705", "wer": "14.291", "wps": "8.8", "ups": "0.07", "wpb": "126.5", "bsz": "8", "num_updates": "42600", "lr": "9.20212e-05", "gnorm": "3.116", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "120504"} [2024-06-27 22:15:42,255][train_inner][INFO] - {"epoch": 2, "update": 1.419, "loss": "0.982", "ntokens": "126.65", "acc_total": "126.65", "n_correct": "108.265", "wer_total": "126.65", "n_error": "18.385", "ppl": "1.98", "accuracy": "85.484", "wer": "14.516", "wps": "69.4", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "42800", "lr": "9.06532e-05", "gnorm": "3.096", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "120869"} [2024-06-27 22:21:46,857][train_inner][INFO] - {"epoch": 2, "update": 1.426, "loss": "0.99", "ntokens": "127.265", "acc_total": "127.265", "n_correct": "108.715", "wer_total": "127.265", "n_error": "18.54", "ppl": "1.99", "accuracy": "85.424", "wer": "14.568", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "43000", "lr": "8.93054e-05", "gnorm": "3.157", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "121234"} [2024-06-27 22:27:51,497][train_inner][INFO] - {"epoch": 2, "update": 1.432, "loss": "1.033", "ntokens": "126.57", "acc_total": "126.57", "n_correct": "107.41", "wer_total": "126.57", "n_error": "19.15", "ppl": "2.05", "accuracy": "84.862", "wer": "15.13", "wps": "69.4", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "43200", "lr": "8.79777e-05", "gnorm": "3.182", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "121598"} [2024-06-27 22:33:56,193][train_inner][INFO] - {"epoch": 2, "update": 1.439, "loss": "0.993", "ntokens": "126.195", "acc_total": "126.195", "n_correct": "107.84", "wer_total": "126.195", "n_error": "18.35", "ppl": "1.99", "accuracy": "85.455", "wer": "14.541", "wps": "69.2", "ups": "0.55", "wpb": "126.2", "bsz": "8", "num_updates": "43400", "lr": "8.66697e-05", "gnorm": "3.168", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "121963"} [2024-06-27 22:40:00,473][train_inner][INFO] - {"epoch": 2, "update": 1.446, "loss": "0.976", "ntokens": "127.27", "acc_total": "127.27", "n_correct": "109.16", "wer_total": "127.27", "n_error": "18.11", "ppl": "1.97", "accuracy": "85.77", "wer": "14.23", "wps": "69.9", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "43600", "lr": "8.53812e-05", "gnorm": "3.087", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "122327"} [2024-06-27 22:41:09,701][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-27 22:46:06,942][train_inner][INFO] - {"epoch": 2, "update": 1.452, "loss": "0.991", "ntokens": "126.085", "acc_total": "126.085", "n_correct": "107.955", "wer_total": "126.085", "n_error": "18.13", "ppl": "1.99", "accuracy": "85.621", "wer": "14.379", "wps": "68.8", "ups": "0.55", "wpb": "126.1", "bsz": "8", "num_updates": "43800", "lr": "8.41118e-05", "gnorm": "3.091", "loss_scale": "1024", "train_wall": "366", "gb_free": "6.5", "wall": "122694"} [2024-06-27 22:52:11,903][train_inner][INFO] - {"epoch": 2, "update": 1.459, "loss": "0.959", "ntokens": "127.58", "acc_total": "127.58", "n_correct": "109.96", "wer_total": "127.58", "n_error": "17.62", "ppl": "1.94", "accuracy": "86.189", "wer": "13.811", "wps": "69.9", "ups": "0.55", "wpb": "127.6", "bsz": "8", "num_updates": "44000", "lr": "8.28614e-05", "gnorm": "3.032", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "123059"} [2024-06-27 22:58:16,535][train_inner][INFO] - {"epoch": 2, "update": 1.466, "loss": "0.982", "ntokens": "126.85", "acc_total": "126.85", "n_correct": "108.74", "wer_total": "126.85", "n_error": "18.11", "ppl": "1.97", "accuracy": "85.723", "wer": "14.277", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "44200", "lr": "8.16294e-05", "gnorm": "3.057", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "123423"} [2024-06-27 23:04:20,942][train_inner][INFO] - {"epoch": 2, "update": 1.472, "loss": "0.937", "ntokens": "127.23", "acc_total": "127.23", "n_correct": "109.63", "wer_total": "127.23", "n_error": "17.59", "ppl": "1.91", "accuracy": "86.167", "wer": "13.825", "wps": "69.8", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "44400", "lr": "8.04159e-05", "gnorm": "3.041", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "123788"} [2024-06-27 23:10:25,276][train_inner][INFO] - {"epoch": 2, "update": 1.479, "loss": "0.96", "ntokens": "126.815", "acc_total": "126.815", "n_correct": "108.93", "wer_total": "126.815", "n_error": "17.88", "ppl": "1.94", "accuracy": "85.897", "wer": "14.099", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "44600", "lr": "7.92203e-05", "gnorm": "2.975", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "124152"} [2024-06-27 23:16:30,026][train_inner][INFO] - {"epoch": 2, "update": 1.485, "loss": "0.968", "ntokens": "127.675", "acc_total": "127.675", "n_correct": "109.58", "wer_total": "127.675", "n_error": "18.095", "ppl": "1.96", "accuracy": "85.827", "wer": "14.173", "wps": "70", "ups": "0.55", "wpb": "127.7", "bsz": "8", "num_updates": "44800", "lr": "7.80425e-05", "gnorm": "3.026", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "124517"} [2024-06-27 23:22:34,476][train_inner][INFO] - {"epoch": 2, "update": 1.492, "loss": "0.933", "ntokens": "127.265", "acc_total": "127.265", "n_correct": "110.05", "wer_total": "127.265", "n_error": "17.21", "ppl": "1.91", "accuracy": "86.473", "wer": "13.523", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "45000", "lr": "7.68823e-05", "gnorm": "3.09", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "124881"} [2024-06-27 23:22:34,477][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 00:04:28,462][valid][INFO] - {"epoch": 2, "valid_loss": "0.854", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.7741", "valid_wer_total": "18.1585", "valid_n_error": "2.38374", "valid_ppl": "1.81", "valid_accuracy": "86.869", "valid_wer": "13.127", "valid_wps": "173.3", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "45000", "valid_best_accuracy": "86.961"} [2024-06-28 00:04:28,463][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 45000 updates [2024-06-28 00:04:28,463][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_45000.pt [2024-06-28 00:04:31,788][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_45000.pt [2024-06-28 00:04:34,274][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_45000.pt (epoch 2 @ 45000 updates, score 86.869) (writing took 5.810438963118941 seconds) [2024-06-28 00:10:38,563][train_inner][INFO] - {"epoch": 2, "update": 1.499, "loss": "0.968", "ntokens": "126.595", "acc_total": "126.595", "n_correct": "108.93", "wer_total": "126.595", "n_error": "17.665", "ppl": "1.96", "accuracy": "86.046", "wer": "13.954", "wps": "8.8", "ups": "0.07", "wpb": "126.6", "bsz": "8", "num_updates": "45200", "lr": "7.57393e-05", "gnorm": "3.102", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "127765"} [2024-06-28 00:16:42,731][train_inner][INFO] - {"epoch": 2, "update": 1.505, "loss": "0.931", "ntokens": "126.74", "acc_total": "126.74", "n_correct": "109.685", "wer_total": "126.74", "n_error": "17.055", "ppl": "1.91", "accuracy": "86.543", "wer": "13.457", "wps": "69.6", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "45400", "lr": "7.46132e-05", "gnorm": "2.91", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "128130"} [2024-06-28 00:22:47,088][train_inner][INFO] - {"epoch": 2, "update": 1.512, "loss": "0.935", "ntokens": "126.795", "acc_total": "126.795", "n_correct": "109.375", "wer_total": "126.795", "n_error": "17.42", "ppl": "1.91", "accuracy": "86.261", "wer": "13.739", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "45600", "lr": "7.3504e-05", "gnorm": "2.902", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "128494"} [2024-06-28 00:28:51,324][train_inner][INFO] - {"epoch": 2, "update": 1.519, "loss": "0.918", "ntokens": "127.1", "acc_total": "127.1", "n_correct": "109.545", "wer_total": "127.1", "n_error": "17.55", "ppl": "1.89", "accuracy": "86.188", "wer": "13.808", "wps": "69.8", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "45800", "lr": "7.24112e-05", "gnorm": "2.994", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "128858"} [2024-06-28 00:34:53,897][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-28 00:34:57,552][train_inner][INFO] - {"epoch": 2, "update": 1.525, "loss": "0.915", "ntokens": "127.21", "acc_total": "127.21", "n_correct": "109.88", "wer_total": "127.21", "n_error": "17.32", "ppl": "1.89", "accuracy": "86.377", "wer": "13.615", "wps": "69.5", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "46000", "lr": "7.13346e-05", "gnorm": "2.877", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "129224"} [2024-06-28 00:41:01,936][train_inner][INFO] - {"epoch": 2, "update": 1.532, "loss": "0.962", "ntokens": "127.38", "acc_total": "127.38", "n_correct": "109.6", "wer_total": "127.38", "n_error": "17.78", "ppl": "1.95", "accuracy": "86.042", "wer": "13.958", "wps": "69.9", "ups": "0.55", "wpb": "127.4", "bsz": "8", "num_updates": "46200", "lr": "7.02741e-05", "gnorm": "3.081", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "129589"} [2024-06-28 00:47:06,626][train_inner][INFO] - {"epoch": 2, "update": 1.539, "loss": "0.914", "ntokens": "126.895", "acc_total": "126.895", "n_correct": "110.215", "wer_total": "126.895", "n_error": "16.675", "ppl": "1.88", "accuracy": "86.855", "wer": "13.141", "wps": "69.6", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "46400", "lr": "6.92293e-05", "gnorm": "3.01", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "129953"} [2024-06-28 00:53:11,134][train_inner][INFO] - {"epoch": 2, "update": 1.545, "loss": "0.927", "ntokens": "128.235", "acc_total": "128.235", "n_correct": "110.895", "wer_total": "128.235", "n_error": "17.34", "ppl": "1.9", "accuracy": "86.478", "wer": "13.522", "wps": "70.4", "ups": "0.55", "wpb": "128.2", "bsz": "8", "num_updates": "46600", "lr": "6.82001e-05", "gnorm": "3.065", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "130318"} [2024-06-28 00:59:15,760][train_inner][INFO] - {"epoch": 2, "update": 1.552, "loss": "0.885", "ntokens": "126.735", "acc_total": "126.735", "n_correct": "110.375", "wer_total": "126.735", "n_error": "16.36", "ppl": "1.85", "accuracy": "87.091", "wer": "12.909", "wps": "69.5", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "46800", "lr": "6.71862e-05", "gnorm": "2.982", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "130683"} [2024-06-28 01:05:20,268][train_inner][INFO] - {"epoch": 2, "update": 1.558, "loss": "0.903", "ntokens": "126.69", "acc_total": "126.69", "n_correct": "110", "wer_total": "126.69", "n_error": "16.685", "ppl": "1.87", "accuracy": "86.826", "wer": "13.17", "wps": "69.5", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "47000", "lr": "6.61873e-05", "gnorm": "2.87", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "131047"} [2024-06-28 01:11:24,575][train_inner][INFO] - {"epoch": 2, "update": 1.565, "loss": "0.916", "ntokens": "127.005", "acc_total": "127.005", "n_correct": "109.875", "wer_total": "127.005", "n_error": "17.125", "ppl": "1.89", "accuracy": "86.512", "wer": "13.484", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "47200", "lr": "6.52033e-05", "gnorm": "2.871", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "131411"} [2024-06-28 01:17:29,108][train_inner][INFO] - {"epoch": 2, "update": 1.572, "loss": "0.896", "ntokens": "127.11", "acc_total": "127.11", "n_correct": "110.29", "wer_total": "127.11", "n_error": "16.81", "ppl": "1.86", "accuracy": "86.767", "wer": "13.225", "wps": "69.7", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "47400", "lr": "6.42339e-05", "gnorm": "2.889", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "131776"} [2024-06-28 01:20:31,312][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 02:02:27,359][valid][INFO] - {"epoch": 2, "valid_loss": "0.832", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.845", "valid_wer_total": "18.1585", "valid_n_error": "2.31271", "valid_ppl": "1.78", "valid_accuracy": "87.26", "valid_wer": "12.736", "valid_wps": "173.1", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "47500", "valid_best_accuracy": "87.26"} [2024-06-28 02:02:27,360][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 47500 updates [2024-06-28 02:02:27,360][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_47500.pt [2024-06-28 02:02:30,663][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_47500.pt [2024-06-28 02:02:36,000][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_47500.pt (epoch 2 @ 47500 updates, score 87.26) (writing took 8.640355909010395 seconds) [2024-06-28 02:05:37,973][train_inner][INFO] - {"epoch": 2, "update": 1.578, "loss": "0.938", "ntokens": "127.57", "acc_total": "127.57", "n_correct": "110.02", "wer_total": "127.57", "n_error": "17.55", "ppl": "1.92", "accuracy": "86.243", "wer": "13.757", "wps": "8.8", "ups": "0.07", "wpb": "127.6", "bsz": "8", "num_updates": "47600", "lr": "6.3279e-05", "gnorm": "3.077", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "134665"} [2024-06-28 02:11:42,790][train_inner][INFO] - {"epoch": 2, "update": 1.585, "loss": "0.883", "ntokens": "127.5", "acc_total": "127.5", "n_correct": "111.165", "wer_total": "127.5", "n_error": "16.335", "ppl": "1.84", "accuracy": "87.188", "wer": "12.812", "wps": "69.9", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "47800", "lr": "6.23382e-05", "gnorm": "2.802", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "135030"} [2024-06-28 02:17:47,460][train_inner][INFO] - {"epoch": 2, "update": 1.592, "loss": "0.951", "ntokens": "127.97", "acc_total": "127.97", "n_correct": "110.14", "wer_total": "127.97", "n_error": "17.825", "ppl": "1.93", "accuracy": "86.067", "wer": "13.929", "wps": "70.2", "ups": "0.55", "wpb": "128", "bsz": "8", "num_updates": "48000", "lr": "6.14114e-05", "gnorm": "3.002", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "135394"} [2024-06-28 02:23:52,096][train_inner][INFO] - {"epoch": 2, "update": 1.598, "loss": "0.915", "ntokens": "126.53", "acc_total": "126.53", "n_correct": "109.39", "wer_total": "126.53", "n_error": "17.135", "ppl": "1.89", "accuracy": "86.454", "wer": "13.542", "wps": "69.4", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "48200", "lr": "6.04984e-05", "gnorm": "3.02", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "135759"} [2024-06-28 02:29:56,466][train_inner][INFO] - {"epoch": 2, "update": 1.605, "loss": "0.894", "ntokens": "126.305", "acc_total": "126.305", "n_correct": "109.565", "wer_total": "126.305", "n_error": "16.74", "ppl": "1.86", "accuracy": "86.746", "wer": "13.254", "wps": "69.3", "ups": "0.55", "wpb": "126.3", "bsz": "8", "num_updates": "48400", "lr": "5.9599e-05", "gnorm": "3.011", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "136123"} [2024-06-28 02:36:01,051][train_inner][INFO] - {"epoch": 2, "update": 1.611, "loss": "0.887", "ntokens": "126.935", "acc_total": "126.935", "n_correct": "110.575", "wer_total": "126.935", "n_error": "16.36", "ppl": "1.85", "accuracy": "87.112", "wer": "12.888", "wps": "69.6", "ups": "0.55", "wpb": "126.9", "bsz": "8", "num_updates": "48600", "lr": "5.87129e-05", "gnorm": "2.883", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "136488"} [2024-06-28 02:42:05,446][train_inner][INFO] - {"epoch": 2, "update": 1.618, "loss": "0.887", "ntokens": "126.17", "acc_total": "126.17", "n_correct": "109.545", "wer_total": "126.17", "n_error": "16.62", "ppl": "1.85", "accuracy": "86.823", "wer": "13.173", "wps": "69.2", "ups": "0.55", "wpb": "126.2", "bsz": "8", "num_updates": "48800", "lr": "5.784e-05", "gnorm": "2.876", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "136852"} [2024-06-28 02:47:40,903][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-28 02:48:11,885][train_inner][INFO] - {"epoch": 2, "update": 1.625, "loss": "0.915", "ntokens": "126.95", "acc_total": "126.95", "n_correct": "109.945", "wer_total": "126.95", "n_error": "17.005", "ppl": "1.88", "accuracy": "86.605", "wer": "13.395", "wps": "69.3", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "49000", "lr": "5.69801e-05", "gnorm": "2.959", "loss_scale": "1024", "train_wall": "366", "gb_free": "6.5", "wall": "137219"} [2024-06-28 02:54:16,446][train_inner][INFO] - {"epoch": 2, "update": 1.631, "loss": "0.886", "ntokens": "126.97", "acc_total": "126.97", "n_correct": "110.475", "wer_total": "126.97", "n_error": "16.485", "ppl": "1.85", "accuracy": "87.009", "wer": "12.983", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "49200", "lr": "5.6133e-05", "gnorm": "2.997", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "137583"} [2024-06-28 03:00:21,167][train_inner][INFO] - {"epoch": 2, "update": 1.638, "loss": "0.87", "ntokens": "127.49", "acc_total": "127.49", "n_correct": "111.065", "wer_total": "127.49", "n_error": "16.405", "ppl": "1.83", "accuracy": "87.117", "wer": "12.868", "wps": "69.9", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "49400", "lr": "5.52984e-05", "gnorm": "3.067", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "137948"} [2024-06-28 03:06:25,667][train_inner][INFO] - {"epoch": 2, "update": 1.645, "loss": "0.924", "ntokens": "127.355", "acc_total": "127.355", "n_correct": "110.08", "wer_total": "127.355", "n_error": "17.27", "ppl": "1.9", "accuracy": "86.436", "wer": "13.561", "wps": "69.9", "ups": "0.55", "wpb": "127.4", "bsz": "8", "num_updates": "49600", "lr": "5.44763e-05", "gnorm": "3.006", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "138312"} [2024-06-28 03:12:30,594][train_inner][INFO] - {"epoch": 2, "update": 1.651, "loss": "0.918", "ntokens": "127.295", "acc_total": "127.295", "n_correct": "110.005", "wer_total": "127.295", "n_error": "17.29", "ppl": "1.89", "accuracy": "86.417", "wer": "13.583", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "49800", "lr": "5.36664e-05", "gnorm": "2.945", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "138677"} [2024-06-28 03:18:35,326][train_inner][INFO] - {"epoch": 2, "update": 1.658, "loss": "0.892", "ntokens": "127.11", "acc_total": "127.11", "n_correct": "110.48", "wer_total": "127.11", "n_error": "16.625", "ppl": "1.86", "accuracy": "86.917", "wer": "13.079", "wps": "69.7", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "50000", "lr": "5.28686e-05", "gnorm": "2.934", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "139042"} [2024-06-28 03:18:35,326][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 04:00:31,355][valid][INFO] - {"epoch": 2, "valid_loss": "0.804", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.9353", "valid_wer_total": "18.1585", "valid_n_error": "2.22255", "valid_ppl": "1.75", "valid_accuracy": "87.757", "valid_wer": "12.24", "valid_wps": "173.1", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "50000", "valid_best_accuracy": "87.757"} [2024-06-28 04:00:31,356][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 50000 updates [2024-06-28 04:00:31,357][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_50000.pt [2024-06-28 04:00:34,638][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_50000.pt [2024-06-28 04:00:39,996][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_50000.pt (epoch 2 @ 50000 updates, score 87.757) (writing took 8.639687242917717 seconds) [2024-06-28 04:06:44,396][train_inner][INFO] - {"epoch": 2, "update": 1.665, "loss": "0.893", "ntokens": "126.53", "acc_total": "126.53", "n_correct": "109.905", "wer_total": "126.53", "n_error": "16.62", "ppl": "1.86", "accuracy": "86.861", "wer": "13.135", "wps": "8.8", "ups": "0.07", "wpb": "126.5", "bsz": "8", "num_updates": "50200", "lr": "5.20826e-05", "gnorm": "2.972", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "141931"} [2024-06-28 04:12:48,743][train_inner][INFO] - {"epoch": 2, "update": 1.671, "loss": "0.903", "ntokens": "126.8", "acc_total": "126.8", "n_correct": "109.85", "wer_total": "126.8", "n_error": "16.95", "ppl": "1.87", "accuracy": "86.632", "wer": "13.368", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "50400", "lr": "5.13083e-05", "gnorm": "2.884", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "142296"} [2024-06-28 04:18:53,144][train_inner][INFO] - {"epoch": 2, "update": 1.678, "loss": "0.895", "ntokens": "125.94", "acc_total": "125.94", "n_correct": "108.795", "wer_total": "125.94", "n_error": "17.135", "ppl": "1.86", "accuracy": "86.386", "wer": "13.606", "wps": "69.1", "ups": "0.55", "wpb": "125.9", "bsz": "8", "num_updates": "50600", "lr": "5.05454e-05", "gnorm": "3.002", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "142660"} [2024-06-28 04:24:58,013][train_inner][INFO] - {"epoch": 2, "update": 1.684, "loss": "0.892", "ntokens": "127.035", "acc_total": "127.035", "n_correct": "109.935", "wer_total": "127.035", "n_error": "17.1", "ppl": "1.86", "accuracy": "86.539", "wer": "13.461", "wps": "69.6", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "50800", "lr": "4.9794e-05", "gnorm": "2.932", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "143025"} [2024-06-28 04:31:02,589][train_inner][INFO] - {"epoch": 2, "update": 1.691, "loss": "0.907", "ntokens": "127.885", "acc_total": "127.885", "n_correct": "110.935", "wer_total": "127.885", "n_error": "16.945", "ppl": "1.87", "accuracy": "86.746", "wer": "13.25", "wps": "70.2", "ups": "0.55", "wpb": "127.9", "bsz": "8", "num_updates": "51000", "lr": "4.90537e-05", "gnorm": "2.916", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "143389"} [2024-06-28 04:37:07,049][train_inner][INFO] - {"epoch": 2, "update": 1.698, "loss": "0.855", "ntokens": "126.6", "acc_total": "126.6", "n_correct": "110.64", "wer_total": "126.6", "n_error": "15.95", "ppl": "1.81", "accuracy": "87.393", "wer": "12.599", "wps": "69.5", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "51200", "lr": "4.83244e-05", "gnorm": "2.789", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "143754"} [2024-06-28 04:43:11,298][train_inner][INFO] - {"epoch": 2, "update": 1.704, "loss": "0.893", "ntokens": "127.465", "acc_total": "127.465", "n_correct": "110.84", "wer_total": "127.465", "n_error": "16.625", "ppl": "1.86", "accuracy": "86.957", "wer": "13.043", "wps": "70", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "51400", "lr": "4.7606e-05", "gnorm": "2.889", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "144118"} [2024-06-28 04:49:15,656][train_inner][INFO] - {"epoch": 2, "update": 1.711, "loss": "0.896", "ntokens": "126.57", "acc_total": "126.57", "n_correct": "110.06", "wer_total": "126.57", "n_error": "16.51", "ppl": "1.86", "accuracy": "86.956", "wer": "13.044", "wps": "69.5", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "51600", "lr": "4.68982e-05", "gnorm": "2.84", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "144482"} [2024-06-28 04:55:20,071][train_inner][INFO] - {"epoch": 2, "update": 1.718, "loss": "0.891", "ntokens": "126.61", "acc_total": "126.61", "n_correct": "109.405", "wer_total": "126.61", "n_error": "17.2", "ppl": "1.85", "accuracy": "86.411", "wer": "13.585", "wps": "69.5", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "51800", "lr": "4.6201e-05", "gnorm": "3.064", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "144847"} [2024-06-28 05:01:24,519][train_inner][INFO] - {"epoch": 2, "update": 1.724, "loss": "0.897", "ntokens": "127.01", "acc_total": "127.01", "n_correct": "110.11", "wer_total": "127.01", "n_error": "16.895", "ppl": "1.86", "accuracy": "86.694", "wer": "13.302", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "52000", "lr": "4.55141e-05", "gnorm": "2.87", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "145211"} [2024-06-28 05:07:28,932][train_inner][INFO] - {"epoch": 2, "update": 1.731, "loss": "0.906", "ntokens": "126.955", "acc_total": "126.955", "n_correct": "109.685", "wer_total": "126.955", "n_error": "17.265", "ppl": "1.87", "accuracy": "86.397", "wer": "13.599", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "52200", "lr": "4.48374e-05", "gnorm": "2.958", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "145576"} [2024-06-28 05:13:33,128][train_inner][INFO] - {"epoch": 2, "update": 1.737, "loss": "0.888", "ntokens": "126.815", "acc_total": "126.815", "n_correct": "109.92", "wer_total": "126.815", "n_error": "16.89", "ppl": "1.85", "accuracy": "86.677", "wer": "13.319", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "52400", "lr": "4.41708e-05", "gnorm": "2.829", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "145940"} [2024-06-28 05:16:35,295][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 05:58:37,233][valid][INFO] - {"epoch": 2, "valid_loss": "0.795", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.9495", "valid_wer_total": "18.1585", "valid_n_error": "2.20834", "valid_ppl": "1.74", "valid_accuracy": "87.835", "valid_wer": "12.161", "valid_wps": "172.7", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "52500", "valid_best_accuracy": "87.835"} [2024-06-28 05:58:37,233][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 52500 updates [2024-06-28 05:58:37,234][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_52500.pt [2024-06-28 05:58:40,559][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_52500.pt [2024-06-28 05:58:45,965][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_52500.pt (epoch 2 @ 52500 updates, score 87.835) (writing took 8.732093732105568 seconds) [2024-06-28 06:01:47,950][train_inner][INFO] - {"epoch": 2, "update": 1.744, "loss": "0.866", "ntokens": "128.16", "acc_total": "128.16", "n_correct": "111.96", "wer_total": "128.16", "n_error": "16.195", "ppl": "1.82", "accuracy": "87.36", "wer": "12.637", "wps": "8.9", "ups": "0.07", "wpb": "128.2", "bsz": "8", "num_updates": "52600", "lr": "4.35142e-05", "gnorm": "2.921", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "148835"} [2024-06-28 06:07:52,855][train_inner][INFO] - {"epoch": 2, "update": 1.751, "loss": "0.855", "ntokens": "126.185", "acc_total": "126.185", "n_correct": "110.425", "wer_total": "126.185", "n_error": "15.76", "ppl": "1.81", "accuracy": "87.51", "wer": "12.49", "wps": "69.2", "ups": "0.55", "wpb": "126.2", "bsz": "8", "num_updates": "52800", "lr": "4.28672e-05", "gnorm": "2.83", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "149200"} [2024-06-28 06:13:57,080][train_inner][INFO] - {"epoch": 2, "update": 1.757, "loss": "0.889", "ntokens": "126.955", "acc_total": "126.955", "n_correct": "110.305", "wer_total": "126.955", "n_error": "16.645", "ppl": "1.85", "accuracy": "86.885", "wer": "13.111", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "53000", "lr": "4.22299e-05", "gnorm": "2.878", "loss_scale": "2048", "train_wall": "363", "gb_free": "6.5", "wall": "149564"} [2024-06-28 06:20:00,619][train_inner][INFO] - {"epoch": 2, "update": 1.764, "loss": "0.89", "ntokens": "127.24", "acc_total": "127.24", "n_correct": "110.51", "wer_total": "127.24", "n_error": "16.73", "ppl": "1.85", "accuracy": "86.852", "wer": "13.148", "wps": "70", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "53200", "lr": "4.16021e-05", "gnorm": "2.841", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "149927"} [2024-06-28 06:26:04,133][train_inner][INFO] - {"epoch": 2, "update": 1.771, "loss": "0.883", "ntokens": "126.09", "acc_total": "126.09", "n_correct": "109.905", "wer_total": "126.09", "n_error": "16.16", "ppl": "1.84", "accuracy": "87.164", "wer": "12.816", "wps": "69.4", "ups": "0.55", "wpb": "126.1", "bsz": "8", "num_updates": "53400", "lr": "4.09836e-05", "gnorm": "2.906", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "150291"} [2024-06-28 06:32:07,737][train_inner][INFO] - {"epoch": 2, "update": 1.777, "loss": "0.864", "ntokens": "127.32", "acc_total": "127.32", "n_correct": "110.99", "wer_total": "127.32", "n_error": "16.33", "ppl": "1.82", "accuracy": "87.174", "wer": "12.826", "wps": "70", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "53600", "lr": "4.03743e-05", "gnorm": "2.774", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "150655"} [2024-06-28 06:38:11,091][train_inner][INFO] - {"epoch": 2, "update": 1.784, "loss": "0.886", "ntokens": "127.33", "acc_total": "127.33", "n_correct": "110.52", "wer_total": "127.33", "n_error": "16.81", "ppl": "1.85", "accuracy": "86.798", "wer": "13.202", "wps": "70.1", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "53800", "lr": "3.9774e-05", "gnorm": "2.847", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "151018"} [2024-06-28 06:44:15,265][train_inner][INFO] - {"epoch": 2, "update": 1.79, "loss": "0.883", "ntokens": "126.29", "acc_total": "126.29", "n_correct": "109.665", "wer_total": "126.29", "n_error": "16.62", "ppl": "1.84", "accuracy": "86.836", "wer": "13.16", "wps": "69.4", "ups": "0.55", "wpb": "126.3", "bsz": "8", "num_updates": "54000", "lr": "3.91827e-05", "gnorm": "2.867", "loss_scale": "4096", "train_wall": "363", "gb_free": "6.5", "wall": "151382"} [2024-06-28 06:50:19,608][train_inner][INFO] - {"epoch": 2, "update": 1.797, "loss": "0.891", "ntokens": "126.955", "acc_total": "126.955", "n_correct": "110.13", "wer_total": "126.955", "n_error": "16.815", "ppl": "1.85", "accuracy": "86.747", "wer": "13.245", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "54200", "lr": "3.86002e-05", "gnorm": "2.822", "loss_scale": "4096", "train_wall": "364", "gb_free": "6.5", "wall": "151746"} [2024-06-28 06:51:34,254][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0 [2024-06-28 06:55:43,451][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-28 06:56:27,201][train_inner][INFO] - {"epoch": 2, "update": 1.804, "loss": "0.851", "ntokens": "127.23", "acc_total": "127.23", "n_correct": "111.03", "wer_total": "127.23", "n_error": "16.2", "ppl": "1.8", "accuracy": "87.267", "wer": "12.733", "wps": "69.2", "ups": "0.54", "wpb": "127.2", "bsz": "8", "num_updates": "54400", "lr": "3.80263e-05", "gnorm": "2.861", "loss_scale": "1024", "train_wall": "367", "gb_free": "6.5", "wall": "152114"} [2024-06-28 07:02:31,418][train_inner][INFO] - {"epoch": 2, "update": 1.81, "loss": "0.924", "ntokens": "126.685", "acc_total": "126.685", "n_correct": "109.79", "wer_total": "126.685", "n_error": "16.89", "ppl": "1.9", "accuracy": "86.664", "wer": "13.332", "wps": "69.6", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "54600", "lr": "3.7461e-05", "gnorm": "3.067", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "152478"} [2024-06-28 07:08:35,163][train_inner][INFO] - {"epoch": 2, "update": 1.817, "loss": "0.91", "ntokens": "125.765", "acc_total": "125.765", "n_correct": "109.08", "wer_total": "125.765", "n_error": "16.68", "ppl": "1.88", "accuracy": "86.733", "wer": "13.263", "wps": "69.2", "ups": "0.55", "wpb": "125.8", "bsz": "8", "num_updates": "54800", "lr": "3.6904e-05", "gnorm": "2.887", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "152842"} [2024-06-28 07:14:38,768][train_inner][INFO] - {"epoch": 2, "update": 1.824, "loss": "0.863", "ntokens": "126.365", "acc_total": "126.365", "n_correct": "109.955", "wer_total": "126.365", "n_error": "16.41", "ppl": "1.82", "accuracy": "87.014", "wer": "12.986", "wps": "69.5", "ups": "0.55", "wpb": "126.4", "bsz": "8", "num_updates": "55000", "lr": "3.63554e-05", "gnorm": "2.861", "loss_scale": "1024", "train_wall": "363", "gb_free": "6.5", "wall": "153206"} [2024-06-28 07:14:38,769][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 07:56:34,064][valid][INFO] - {"epoch": 2, "valid_loss": "0.782", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "15.9666", "valid_wer_total": "18.1585", "valid_n_error": "2.1915", "valid_ppl": "1.72", "valid_accuracy": "87.929", "valid_wer": "12.069", "valid_wps": "173.2", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "55000", "valid_best_accuracy": "87.929"} [2024-06-28 07:56:34,064][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 55000 updates [2024-06-28 07:56:34,065][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_55000.pt [2024-06-28 07:56:37,383][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_55000.pt [2024-06-28 07:56:42,695][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_55000.pt (epoch 2 @ 55000 updates, score 87.929) (writing took 8.630251178983599 seconds) [2024-06-28 08:02:47,068][train_inner][INFO] - {"epoch": 2, "update": 1.83, "loss": "0.888", "ntokens": "126.87", "acc_total": "126.87", "n_correct": "110.195", "wer_total": "126.87", "n_error": "16.67", "ppl": "1.85", "accuracy": "86.857", "wer": "13.139", "wps": "8.8", "ups": "0.07", "wpb": "126.9", "bsz": "8", "num_updates": "55200", "lr": "3.58149e-05", "gnorm": "2.874", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "156094"} [2024-06-28 08:08:51,540][train_inner][INFO] - {"epoch": 2, "update": 1.837, "loss": "0.89", "ntokens": "126.795", "acc_total": "126.795", "n_correct": "110.335", "wer_total": "126.795", "n_error": "16.46", "ppl": "1.85", "accuracy": "87.018", "wer": "12.982", "wps": "69.6", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "55400", "lr": "3.52824e-05", "gnorm": "2.879", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "156458"} [2024-06-28 08:14:56,171][train_inner][INFO] - {"epoch": 2, "update": 1.844, "loss": "0.904", "ntokens": "126.725", "acc_total": "126.725", "n_correct": "110.165", "wer_total": "126.725", "n_error": "16.555", "ppl": "1.87", "accuracy": "86.932", "wer": "13.064", "wps": "69.5", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "55600", "lr": "3.47579e-05", "gnorm": "2.941", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "156823"} [2024-06-28 08:21:01,686][train_inner][INFO] - {"epoch": 2, "update": 1.85, "loss": "0.877", "ntokens": "127.425", "acc_total": "127.425", "n_correct": "111.005", "wer_total": "127.425", "n_error": "16.415", "ppl": "1.84", "accuracy": "87.114", "wer": "12.882", "wps": "69.7", "ups": "0.55", "wpb": "127.4", "bsz": "8", "num_updates": "55800", "lr": "3.42411e-05", "gnorm": "2.972", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "157188"} [2024-06-28 08:27:06,167][train_inner][INFO] - {"epoch": 2, "update": 1.857, "loss": "0.888", "ntokens": "127.415", "acc_total": "127.415", "n_correct": "110.585", "wer_total": "127.415", "n_error": "16.83", "ppl": "1.85", "accuracy": "86.791", "wer": "13.209", "wps": "69.9", "ups": "0.55", "wpb": "127.4", "bsz": "8", "num_updates": "56000", "lr": "3.37321e-05", "gnorm": "2.904", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "157553"} [2024-06-28 08:33:10,796][train_inner][INFO] - {"epoch": 2, "update": 1.863, "loss": "0.887", "ntokens": "127", "acc_total": "127", "n_correct": "110.655", "wer_total": "127", "n_error": "16.335", "ppl": "1.85", "accuracy": "87.13", "wer": "12.862", "wps": "69.7", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "56200", "lr": "3.32306e-05", "gnorm": "2.83", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "157918"} [2024-06-28 08:39:15,373][train_inner][INFO] - {"epoch": 2, "update": 1.87, "loss": "0.846", "ntokens": "127.305", "acc_total": "127.305", "n_correct": "111.225", "wer_total": "127.305", "n_error": "16.07", "ppl": "1.8", "accuracy": "87.369", "wer": "12.623", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "56400", "lr": "3.27365e-05", "gnorm": "2.869", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "158282"} [2024-06-28 08:45:20,000][train_inner][INFO] - {"epoch": 2, "update": 1.877, "loss": "0.873", "ntokens": "126.595", "acc_total": "126.595", "n_correct": "110.36", "wer_total": "126.595", "n_error": "16.235", "ppl": "1.83", "accuracy": "87.176", "wer": "12.824", "wps": "69.4", "ups": "0.55", "wpb": "126.6", "bsz": "8", "num_updates": "56600", "lr": "3.22498e-05", "gnorm": "3.055", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "158647"} [2024-06-28 08:51:24,498][train_inner][INFO] - {"epoch": 2, "update": 1.883, "loss": "0.898", "ntokens": "127.31", "acc_total": "127.31", "n_correct": "110.675", "wer_total": "127.31", "n_error": "16.635", "ppl": "1.86", "accuracy": "86.933", "wer": "13.067", "wps": "69.9", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "56800", "lr": "3.17704e-05", "gnorm": "2.886", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "159011"} [2024-06-28 08:57:29,391][train_inner][INFO] - {"epoch": 2, "update": 1.89, "loss": "0.894", "ntokens": "126.765", "acc_total": "126.765", "n_correct": "109.95", "wer_total": "126.765", "n_error": "16.81", "ppl": "1.86", "accuracy": "86.735", "wer": "13.261", "wps": "69.5", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "57000", "lr": "3.1298e-05", "gnorm": "2.906", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "159376"} [2024-06-28 08:59:33,331][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0 [2024-06-28 09:03:35,950][train_inner][INFO] - {"epoch": 2, "update": 1.897, "loss": "0.854", "ntokens": "126.14", "acc_total": "126.14", "n_correct": "110.365", "wer_total": "126.14", "n_error": "15.77", "ppl": "1.81", "accuracy": "87.494", "wer": "12.502", "wps": "68.8", "ups": "0.55", "wpb": "126.1", "bsz": "8", "num_updates": "57200", "lr": "3.08327e-05", "gnorm": "2.897", "loss_scale": "1024", "train_wall": "366", "gb_free": "6.5", "wall": "159743"} [2024-06-28 09:09:40,981][train_inner][INFO] - {"epoch": 2, "update": 1.903, "loss": "0.858", "ntokens": "126.83", "acc_total": "126.83", "n_correct": "110.185", "wer_total": "126.83", "n_error": "16.645", "ppl": "1.81", "accuracy": "86.876", "wer": "13.124", "wps": "69.5", "ups": "0.55", "wpb": "126.8", "bsz": "8", "num_updates": "57400", "lr": "3.03743e-05", "gnorm": "2.84", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "160108"} [2024-06-28 09:12:43,411][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 09:54:42,784][valid][INFO] - {"epoch": 2, "valid_loss": "0.771", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "16.0008", "valid_wer_total": "18.1585", "valid_n_error": "2.15711", "valid_ppl": "1.71", "valid_accuracy": "88.117", "valid_wer": "11.879", "valid_wps": "172.9", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "57500", "valid_best_accuracy": "88.117"} [2024-06-28 09:54:42,785][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 57500 updates [2024-06-28 09:54:42,785][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_57500.pt [2024-06-28 09:54:46,090][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_57500.pt [2024-06-28 09:54:51,483][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_57500.pt (epoch 2 @ 57500 updates, score 88.117) (writing took 8.698047182057053 seconds) [2024-06-28 09:57:53,630][train_inner][INFO] - {"epoch": 2, "update": 1.91, "loss": "0.845", "ntokens": "127.14", "acc_total": "127.14", "n_correct": "111", "wer_total": "127.14", "n_error": "16.135", "ppl": "1.8", "accuracy": "87.305", "wer": "12.691", "wps": "8.8", "ups": "0.07", "wpb": "127.1", "bsz": "8", "num_updates": "57600", "lr": "2.99228e-05", "gnorm": "2.794", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "163000"} [2024-06-28 10:03:58,821][train_inner][INFO] - {"epoch": 2, "update": 1.917, "loss": "0.878", "ntokens": "127.84", "acc_total": "127.84", "n_correct": "110.985", "wer_total": "127.84", "n_error": "16.855", "ppl": "1.84", "accuracy": "86.816", "wer": "13.184", "wps": "70", "ups": "0.55", "wpb": "127.8", "bsz": "8", "num_updates": "57800", "lr": "2.94779e-05", "gnorm": "2.899", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "163366"} [2024-06-28 10:10:04,141][train_inner][INFO] - {"epoch": 2, "update": 1.923, "loss": "0.862", "ntokens": "127.16", "acc_total": "127.16", "n_correct": "111.08", "wer_total": "127.16", "n_error": "16.075", "ppl": "1.82", "accuracy": "87.355", "wer": "12.642", "wps": "69.6", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "58000", "lr": "2.90397e-05", "gnorm": "2.814", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "163731"} [2024-06-28 10:16:09,157][train_inner][INFO] - {"epoch": 2, "update": 1.93, "loss": "0.872", "ntokens": "126.485", "acc_total": "126.485", "n_correct": "109.985", "wer_total": "126.485", "n_error": "16.5", "ppl": "1.83", "accuracy": "86.955", "wer": "13.045", "wps": "69.3", "ups": "0.55", "wpb": "126.5", "bsz": "8", "num_updates": "58200", "lr": "2.86079e-05", "gnorm": "2.848", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "164096"} [2024-06-28 10:22:14,453][train_inner][INFO] - {"epoch": 2, "update": 1.936, "loss": "0.835", "ntokens": "127.135", "acc_total": "127.135", "n_correct": "111.67", "wer_total": "127.135", "n_error": "15.46", "ppl": "1.78", "accuracy": "87.836", "wer": "12.16", "wps": "69.6", "ups": "0.55", "wpb": "127.1", "bsz": "8", "num_updates": "58400", "lr": "2.81826e-05", "gnorm": "2.859", "loss_scale": "1024", "train_wall": "365", "gb_free": "6.5", "wall": "164461"} [2024-06-28 10:28:19,438][train_inner][INFO] - {"epoch": 2, "update": 1.943, "loss": "0.857", "ntokens": "127.205", "acc_total": "127.205", "n_correct": "110.94", "wer_total": "127.205", "n_error": "16.26", "ppl": "1.81", "accuracy": "87.214", "wer": "12.783", "wps": "69.7", "ups": "0.55", "wpb": "127.2", "bsz": "8", "num_updates": "58600", "lr": "2.77636e-05", "gnorm": "2.84", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "164826"} [2024-06-28 10:34:24,459][train_inner][INFO] - {"epoch": 2, "update": 1.95, "loss": "0.873", "ntokens": "126.7", "acc_total": "126.7", "n_correct": "110.185", "wer_total": "126.7", "n_error": "16.515", "ppl": "1.83", "accuracy": "86.965", "wer": "13.035", "wps": "69.4", "ups": "0.55", "wpb": "126.7", "bsz": "8", "num_updates": "58800", "lr": "2.73509e-05", "gnorm": "2.889", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "165191"} [2024-06-28 10:40:29,525][train_inner][INFO] - {"epoch": 2, "update": 1.956, "loss": "0.878", "ntokens": "128.2", "acc_total": "128.2", "n_correct": "111.525", "wer_total": "128.2", "n_error": "16.675", "ppl": "1.84", "accuracy": "86.993", "wer": "13.007", "wps": "70.2", "ups": "0.55", "wpb": "128.2", "bsz": "8", "num_updates": "59000", "lr": "2.69442e-05", "gnorm": "2.762", "loss_scale": "1024", "train_wall": "364", "gb_free": "6.5", "wall": "165556"} [2024-06-28 10:46:34,848][train_inner][INFO] - {"epoch": 2, "update": 1.963, "loss": "0.859", "ntokens": "127.465", "acc_total": "127.465", "n_correct": "111.325", "wer_total": "127.465", "n_error": "16.13", "ppl": "1.81", "accuracy": "87.338", "wer": "12.654", "wps": "69.8", "ups": "0.55", "wpb": "127.5", "bsz": "8", "num_updates": "59200", "lr": "2.65436e-05", "gnorm": "2.792", "loss_scale": "2048", "train_wall": "365", "gb_free": "6.5", "wall": "165922"} [2024-06-28 10:52:39,987][train_inner][INFO] - {"epoch": 2, "update": 1.97, "loss": "0.881", "ntokens": "127.035", "acc_total": "127.035", "n_correct": "110.61", "wer_total": "127.035", "n_error": "16.415", "ppl": "1.84", "accuracy": "87.07", "wer": "12.922", "wps": "69.6", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "59400", "lr": "2.6149e-05", "gnorm": "2.851", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "166287"} [2024-06-28 10:58:45,084][train_inner][INFO] - {"epoch": 2, "update": 1.976, "loss": "0.871", "ntokens": "127.345", "acc_total": "127.345", "n_correct": "110.915", "wer_total": "127.345", "n_error": "16.43", "ppl": "1.83", "accuracy": "87.098", "wer": "12.902", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "59600", "lr": "2.57603e-05", "gnorm": "2.823", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "166652"} [2024-06-28 11:04:50,244][train_inner][INFO] - {"epoch": 2, "update": 1.983, "loss": "0.857", "ntokens": "127.045", "acc_total": "127.045", "n_correct": "110.76", "wer_total": "127.045", "n_error": "16.28", "ppl": "1.81", "accuracy": "87.182", "wer": "12.814", "wps": "69.6", "ups": "0.55", "wpb": "127", "bsz": "8", "num_updates": "59800", "lr": "2.53773e-05", "gnorm": "2.95", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "167017"} [2024-06-28 11:10:55,131][train_inner][INFO] - {"epoch": 2, "update": 1.989, "loss": "0.886", "ntokens": "127.305", "acc_total": "127.305", "n_correct": "110.515", "wer_total": "127.305", "n_error": "16.79", "ppl": "1.85", "accuracy": "86.811", "wer": "13.189", "wps": "69.8", "ups": "0.55", "wpb": "127.3", "bsz": "8", "num_updates": "60000", "lr": "2.5e-05", "gnorm": "2.913", "loss_scale": "2048", "train_wall": "364", "gb_free": "6.5", "wall": "167382"} [2024-06-28 11:10:55,131][fairseq_cli.train][INFO] - Stopping training due to num_updates: 60000 >= max_update: 60000 [2024-06-28 11:10:55,132][fairseq_cli.train][INFO] - begin validation on "valid" subset [2024-06-28 11:52:55,696][valid][INFO] - {"epoch": 2, "valid_loss": "0.765", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "16.0075", "valid_wer_total": "18.1585", "valid_n_error": "2.15052", "valid_ppl": "1.7", "valid_accuracy": "88.154", "valid_wer": "11.843", "valid_wps": "172.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "60000", "valid_best_accuracy": "88.154"} [2024-06-28 11:52:55,697][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 60000 updates [2024-06-28 11:52:55,697][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_60000.pt [2024-06-28 11:52:58,991][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_60000.pt [2024-06-28 11:53:04,348][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_60000.pt (epoch 2 @ 60000 updates, score 88.154) (writing took 8.651636151829734 seconds) [2024-06-28 11:53:04,396][fairseq_cli.train][INFO] - end of epoch 2 (average epoch stats below) [2024-06-28 11:53:04,399][train][INFO] - {"epoch": 2, "train_loss": "0.979", "train_ntokens": "126.915", "train_acc_total": "126.915", "train_n_correct": "108.761", "train_wer_total": "126.915", "train_n_error": "18.1492", "train_ppl": "1.97", "train_accuracy": "85.696", "train_wer": "14.3", "train_wps": "44.7", "train_ups": "0.35", "train_wpb": "126.9", "train_bsz": "8", "train_num_updates": "60000", "train_lr": "2.5e-05", "train_gnorm": "3.12", "train_loss_scale": "2048", "train_train_wall": "54315", "train_gb_free": "6.5", "train_wall": "169911"} [2024-06-28 11:53:04,400][fairseq_cli.train][INFO] - done training in 169910.1 seconds