ViAVSP-LLM_v1.4.1 / hydra_train.log

Upload 10 files

85e0489 verified 7 months ago

229 kB

	[2024-06-28 14:51:38,090][fairseq_cli.train][INFO] - {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 200, 'log_format': 'json', 'log_file': None, 'tensorboard_logdir': 'tblog', 'wandb_project': 'AVSP-LLM', 'azureml_logging': False, 'seed': 1337, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/theodore/Projects/VSP-LLM/src', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'no_c10d', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': True, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_algorithm': 'LocalSGD', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': None, 'batch_size': 1, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': None, 'batch_size_valid': 1, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 30000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': True, 'update_freq': [8], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False}, 'checkpoint': {'_name': None, 'save_dir': 'checkpoints', 'restore_file': 'checkpoint_last.pt', 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 2500, 'keep_interval_updates': 1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'accuracy', 'maximize_best_checkpoint_metric': True, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': {'_name': 'vsp_llm', 'w2v_path': '/home/theodore/Projects/VSP-LLM/checkpoints/large_vox_iter5.pt', 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'apply_mask': False, 'mask_selection': 'static', 'mask_length': 10, 'mask_other': 0, 'mask_prob': 0.75, 'mask_channel_selection': 'static', 'mask_channel_length': 64, 'mask_channel_other': 0, 'mask_channel_prob': 0.5, 'layerdrop': 0.1, 'dropout': 0.0, 'activation_dropout': 0.1, 'attention_dropout': 0.0, 'feature_grad_mult': 1.0, 'encoder_embed_dim': 1024, 'decoder_embed_dim': 4096, 'freeze_finetune_updates': 40000}, 'task': {'_name': 'vsp_llm_training', 'is_s2s': True, 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/base', 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/base', 'normalize': True, 'labels': ['wrd'], 'single_target': True, 'fine_tuning': True, 'stack_order_audio': 4, 'max_sample_size': 500, 'modalities': ['video', 'audio'], 'image_aug': True, 'pad_audio': True, 'random_crop': False, 'llm_ckpt_path': 'vilm/vinallama-2.7b'}, 'criterion': {'_name': 'decoder_only_language_modeling_loss', 'report_accuracy': True, 'label_smoothing': 0.1}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9,0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'tri_stage', 'warmup_steps': 10000, 'hold_steps': 0, 'decay_steps': 20000, 'phase_ratio': None, 'init_lr_scale': 0.01, 'final_lr_scale': 0.05, 'max_update': 30000, 'lr': [0.0005]}, 'scoring': None, 'bpe': None, 'tokenizer': None, 'job_logging_cfg': {'version': 1, 'formatters': {'simple': {'format': '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.FileHandler', 'formatter': 'simple', 'filename': 'hydra_train.log'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}, 'disable_existing_loggers': False}}
	[2024-06-28 14:51:38,093][src.vsp_llm_training][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.4.1
	[2024-06-28 14:51:38,093][src.vsp_llm_training][INFO] - AVHubertPretrainingTask Config {'_name': 'vsp_llm_training', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/base', 'labels': ['wrd'], 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/base', 'label_rate': -1, 'sample_rate': 16000, 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'normalize': True, 'enable_padding': False, 'max_sample_size': 500, 'min_sample_size': None, 'max_trim_sample_size': '${task.max_sample_size}', 'single_target': True, 'random_crop': False, 'pad_audio': True, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['video', 'audio'], 'is_s2s': True, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': True}
	[2024-06-28 14:51:40,064][src.hubert_pretraining][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.4.1
	[2024-06-28 14:51:40,064][src.hubert_pretraining][INFO] - AVHubertPretrainingTask Config {'_name': 'av_hubert_pretraining', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/base', 'labels': ['km'], 'label_dir': '/checkpoint/bshi/data/lrs3//video/hubert/stitch-iters/envox-iter4-l12c2000/', 'label_rate': 25, 'sample_rate': 25, 'normalize': True, 'enable_padding': False, 'max_sample_size': 2000, 'min_sample_size': 5, 'max_trim_sample_size': 400, 'single_target': False, 'random_crop': True, 'pad_audio': False, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['audio', 'video'], 'is_s2s': False, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': False}
	[2024-06-28 14:51:40,068][src.hubert][INFO] - HubertModel Config: {'_name': 'av_hubert', 'label_rate': 25, 'input_modality': '${task.input_modality}', 'extractor_mode': default, 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': gelu, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.1, 'encoder_layerdrop': 0.1, 'dropout_input': 0.0, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 1.0, 'mask_length_audio': 10, 'mask_prob_audio': 0.8, 'mask_length_image': 5, 'mask_prob_image': 0.3, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'resnet_relu_type': 'prelu', 'resnet_weights': None, 'sim_type': 'cosine', 'sub_encoder_layers': 0, 'audio_feat_dim': 104, 'modality_dropout': 0.5, 'audio_dropout': 0.5, 'modality_fuse': 'concat', 'selection_type': 'same_seq', 'masking_type': 'input', 'decoder_embed_dim': 768, 'decoder_ffn_embed_dim': 3072, 'decoder_layers': 6, 'decoder_layerdrop': 0.0, 'decoder_attention_heads': 4, 'decoder_learned_pos': False, 'decoder_normalize_before': False, 'no_token_positional_embeddings': False, 'decoder_dropout': 0.1, 'decoder_attention_dropout': 0.1, 'decoder_activation_dropout': 0.0, 'max_target_positions': 2048, 'share_decoder_input_output_embed': False, 'no_scale_embedding': True}
	[2024-06-28 14:51:47,058][fairseq_cli.train][INFO] - avhubert_llm_seq2seq_cluster_count(
	(encoder): HubertEncoderWrapper(
	(w2v_model): AVHubertModel(
	(feature_extractor_audio): SubModel(
	(proj): Linear(in_features=104, out_features=1024, bias=True)
	)
	(feature_extractor_video): SubModel(
	(resnet): ResEncoder(
	(frontend3D): Sequential(
	(0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
	(1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(2): PReLU(num_parameters=64)
	(3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
	)
	(trunk): ResNet(
	(layer1): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=64)
	(relu2): PReLU(num_parameters=64)
	(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	(1): BasicBlock(
	(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=64)
	(relu2): PReLU(num_parameters=64)
	(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer2): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=128)
	(relu2): PReLU(num_parameters=128)
	(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=128)
	(relu2): PReLU(num_parameters=128)
	(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer3): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=256)
	(relu2): PReLU(num_parameters=256)
	(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=256)
	(relu2): PReLU(num_parameters=256)
	(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer4): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=512)
	(relu2): PReLU(num_parameters=512)
	(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=512)
	(relu2): PReLU(num_parameters=512)
	(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(avgpool): AdaptiveAvgPool2d(output_size=1)
	)
	)
	(proj): Linear(in_features=512, out_features=1024, bias=True)
	)
	(post_extract_proj): Linear(in_features=2048, out_features=1024, bias=True)
	(dropout_input): Dropout(p=0.0, inplace=False)
	(dropout_features): Dropout(p=0.1, inplace=False)
	(encoder): TransformerEncoder(
	(pos_conv): Sequential(
	(0): Conv1d(1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
	(1): SamePad()
	(2): GELU(approximate='none')
	)
	(layers): ModuleList(
	(0-23): 24 x TransformerSentenceEncoderLayer(
	(self_attn): MultiheadAttention(
	(dropout_module): FairseqDropout()
	(k_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(v_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(q_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
	)
	(dropout1): Dropout(p=0.0, inplace=False)
	(dropout2): Dropout(p=0.1, inplace=False)
	(dropout3): Dropout(p=0.0, inplace=False)
	(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	(fc1): Linear(in_features=1024, out_features=4096, bias=True)
	(fc2): Linear(in_features=4096, out_features=1024, bias=True)
	(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	)
	)
	(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	)
	(layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
	(final_proj): None
	)
	)
	(decoder): PeftModelForCausalLM(
	(base_model): LoraModel(
	(model): LlamaForCausalLM(
	(model): LlamaModel(
	(embed_tokens): Embedding(46304, 2560, padding_idx=0)
	(layers): ModuleList(
	(0-31): 32 x LlamaDecoderLayer(
	(self_attn): LlamaSdpaAttention(
	(q_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(k_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(v_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(rotary_emb): LlamaRotaryEmbedding()
	)
	(mlp): LlamaMLP(
	(gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
	(up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
	(down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
	(act_fn): SiLU()
	)
	(input_layernorm): LlamaRMSNorm()
	(post_attention_layernorm): LlamaRMSNorm()
	)
	)
	(norm): LlamaRMSNorm()
	)
	(lm_head): Linear(in_features=2560, out_features=46304, bias=False)
	)
	)
	)
	(avfeat_to_llm): Linear(in_features=1024, out_features=2560, bias=True)
	)
	[2024-06-28 14:51:47,063][fairseq_cli.train][INFO] - task: VSP_LLM_TrainingTask
	[2024-06-28 14:51:47,063][fairseq_cli.train][INFO] - model: avhubert_llm_seq2seq_cluster_count
	[2024-06-28 14:51:47,063][fairseq_cli.train][INFO] - criterion: decoder_only_language_modeling_loss
	[2024-06-28 14:51:47,067][fairseq_cli.train][INFO] - num. shared model params: 1,841,644,264 (num. trained: 335,624,424)
	[2024-06-28 14:51:47,069][fairseq_cli.train][INFO] - num. expert model params: 0 (num. trained: 0)
	[2024-06-28 15:04:24,843][fairseq_cli.train][INFO] - {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 200, 'log_format': 'json', 'log_file': None, 'tensorboard_logdir': 'tblog', 'wandb_project': 'AVSP-LLM', 'azureml_logging': False, 'seed': 1337, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/home/theodore/Projects/VSP-LLM/src', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_num_procs': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'no_c10d', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': True, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_algorithm': 'LocalSGD', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': None, 'batch_size': 1, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': None, 'batch_size_valid': 1, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 30000, 'stop_time_hours': 0.0, 'clip_norm': 0.0, 'sentence_avg': True, 'update_freq': [8], 'lr': [0.0005], 'stop_min_lr': -1.0, 'use_bmuf': False}, 'checkpoint': {'_name': None, 'save_dir': 'checkpoints', 'restore_file': 'checkpoint_last.pt', 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 2500, 'keep_interval_updates': 1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'accuracy', 'maximize_best_checkpoint_metric': True, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': {'_name': 'vsp_llm', 'w2v_path': '/home/theodore/Projects/VSP-LLM/checkpoints/large_vox_iter5.pt', 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'apply_mask': False, 'mask_selection': 'static', 'mask_length': 10, 'mask_other': 0, 'mask_prob': 0.75, 'mask_channel_selection': 'static', 'mask_channel_length': 64, 'mask_channel_other': 0, 'mask_channel_prob': 0.5, 'layerdrop': 0.1, 'dropout': 0.0, 'activation_dropout': 0.1, 'attention_dropout': 0.0, 'feature_grad_mult': 1.0, 'encoder_embed_dim': 1024, 'decoder_embed_dim': 4096, 'freeze_finetune_updates': 40000}, 'task': {'_name': 'vsp_llm_training', 'is_s2s': True, 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base', 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base', 'normalize': True, 'labels': ['wrd'], 'single_target': True, 'fine_tuning': True, 'stack_order_audio': 4, 'max_sample_size': 500, 'modalities': ['video', 'audio'], 'image_aug': True, 'pad_audio': True, 'random_crop': False, 'llm_ckpt_path': 'vilm/vinallama-2.7b'}, 'criterion': {'_name': 'decoder_only_language_modeling_loss', 'report_accuracy': True, 'label_smoothing': 0.1}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9,0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'tpu': False, 'lr': [0.0005]}, 'lr_scheduler': {'_name': 'tri_stage', 'warmup_steps': 10000, 'hold_steps': 0, 'decay_steps': 20000, 'phase_ratio': None, 'init_lr_scale': 0.01, 'final_lr_scale': 0.05, 'max_update': 30000, 'lr': [0.0005]}, 'scoring': None, 'bpe': None, 'tokenizer': None, 'job_logging_cfg': {'version': 1, 'formatters': {'simple': {'format': '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.FileHandler', 'formatter': 'simple', 'filename': 'hydra_train.log'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}, 'disable_existing_loggers': False}}
	[2024-06-28 15:04:24,845][src.vsp_llm_training][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.4.1
	[2024-06-28 15:04:24,845][src.vsp_llm_training][INFO] - AVHubertPretrainingTask Config {'_name': 'vsp_llm_training', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base', 'labels': ['wrd'], 'label_dir': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base', 'label_rate': -1, 'sample_rate': 16000, 'llm_ckpt_path': 'vilm/vinallama-2.7b', 'normalize': True, 'enable_padding': False, 'max_sample_size': 500, 'min_sample_size': None, 'max_trim_sample_size': '${task.max_sample_size}', 'single_target': True, 'random_crop': False, 'pad_audio': True, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['video', 'audio'], 'is_s2s': True, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': True}
	[2024-06-28 15:04:26,050][src.hubert_pretraining][INFO] - current directory is /home/theodore/Projects/VSP-LLM/experiments/ViAVSP-LLM_v1.4.1
	[2024-06-28 15:04:26,050][src.hubert_pretraining][INFO] - AVHubertPretrainingTask Config {'_name': 'av_hubert_pretraining', 'data': '/home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base', 'labels': ['km'], 'label_dir': '/checkpoint/bshi/data/lrs3//video/hubert/stitch-iters/envox-iter4-l12c2000/', 'label_rate': 25, 'sample_rate': 25, 'normalize': True, 'enable_padding': False, 'max_sample_size': 2000, 'min_sample_size': 5, 'max_trim_sample_size': 400, 'single_target': False, 'random_crop': True, 'pad_audio': False, 'pdb': False, 'stack_order_audio': 4, 'skip_verify': False, 'image_aug': True, 'image_crop_size': 88, 'image_mean': 0.421, 'image_std': 0.165, 'modalities': ['audio', 'video'], 'is_s2s': False, 'tokenizer_bpe_name': None, 'tokenizer_bpe_model': None, 'noise_wav': None, 'noise_prob': 0.0, 'noise_snr': '0', 'noise_num': 1, 'fine_tuning': False}
	[2024-06-28 15:04:26,054][src.hubert][INFO] - HubertModel Config: {'_name': 'av_hubert', 'label_rate': 25, 'input_modality': '${task.input_modality}', 'extractor_mode': default, 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': gelu, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.1, 'encoder_layerdrop': 0.1, 'dropout_input': 0.0, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 1.0, 'mask_length_audio': 10, 'mask_prob_audio': 0.8, 'mask_length_image': 5, 'mask_prob_image': 0.3, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'resnet_relu_type': 'prelu', 'resnet_weights': None, 'sim_type': 'cosine', 'sub_encoder_layers': 0, 'audio_feat_dim': 104, 'modality_dropout': 0.5, 'audio_dropout': 0.5, 'modality_fuse': 'concat', 'selection_type': 'same_seq', 'masking_type': 'input', 'decoder_embed_dim': 768, 'decoder_ffn_embed_dim': 3072, 'decoder_layers': 6, 'decoder_layerdrop': 0.0, 'decoder_attention_heads': 4, 'decoder_learned_pos': False, 'decoder_normalize_before': False, 'no_token_positional_embeddings': False, 'decoder_dropout': 0.1, 'decoder_attention_dropout': 0.1, 'decoder_activation_dropout': 0.0, 'max_target_positions': 2048, 'share_decoder_input_output_embed': False, 'no_scale_embedding': True}
	[2024-06-28 15:04:30,383][fairseq_cli.train][INFO] - avhubert_llm_seq2seq_cluster_count(
	(encoder): HubertEncoderWrapper(
	(w2v_model): AVHubertModel(
	(feature_extractor_audio): SubModel(
	(proj): Linear(in_features=104, out_features=1024, bias=True)
	)
	(feature_extractor_video): SubModel(
	(resnet): ResEncoder(
	(frontend3D): Sequential(
	(0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
	(1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(2): PReLU(num_parameters=64)
	(3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
	)
	(trunk): ResNet(
	(layer1): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=64)
	(relu2): PReLU(num_parameters=64)
	(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	(1): BasicBlock(
	(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=64)
	(relu2): PReLU(num_parameters=64)
	(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer2): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=128)
	(relu2): PReLU(num_parameters=128)
	(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=128)
	(relu2): PReLU(num_parameters=128)
	(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer3): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=256)
	(relu2): PReLU(num_parameters=256)
	(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=256)
	(relu2): PReLU(num_parameters=256)
	(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(layer4): Sequential(
	(0): BasicBlock(
	(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=512)
	(relu2): PReLU(num_parameters=512)
	(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(downsample): Sequential(
	(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
	(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(1): BasicBlock(
	(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	(relu1): PReLU(num_parameters=512)
	(relu2): PReLU(num_parameters=512)
	(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
	(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
	)
	)
	(avgpool): AdaptiveAvgPool2d(output_size=1)
	)
	)
	(proj): Linear(in_features=512, out_features=1024, bias=True)
	)
	(post_extract_proj): Linear(in_features=2048, out_features=1024, bias=True)
	(dropout_input): Dropout(p=0.0, inplace=False)
	(dropout_features): Dropout(p=0.1, inplace=False)
	(encoder): TransformerEncoder(
	(pos_conv): Sequential(
	(0): Conv1d(1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
	(1): SamePad()
	(2): GELU(approximate='none')
	)
	(layers): ModuleList(
	(0-23): 24 x TransformerSentenceEncoderLayer(
	(self_attn): MultiheadAttention(
	(dropout_module): FairseqDropout()
	(k_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(v_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(q_proj): Linear(in_features=1024, out_features=1024, bias=True)
	(out_proj): Linear(in_features=1024, out_features=1024, bias=True)
	)
	(dropout1): Dropout(p=0.0, inplace=False)
	(dropout2): Dropout(p=0.1, inplace=False)
	(dropout3): Dropout(p=0.0, inplace=False)
	(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	(fc1): Linear(in_features=1024, out_features=4096, bias=True)
	(fc2): Linear(in_features=4096, out_features=1024, bias=True)
	(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	)
	)
	(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
	)
	(layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
	(final_proj): None
	)
	)
	(decoder): PeftModelForCausalLM(
	(base_model): LoraModel(
	(model): LlamaForCausalLM(
	(model): LlamaModel(
	(embed_tokens): Embedding(46304, 2560, padding_idx=0)
	(layers): ModuleList(
	(0-31): 32 x LlamaDecoderLayer(
	(self_attn): LlamaSdpaAttention(
	(q_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(k_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(v_proj): lora.Linear4bit(
	(base_layer): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(lora_dropout): ModuleDict(
	(default): Dropout(p=0.05, inplace=False)
	)
	(lora_A): ModuleDict(
	(default): Linear(in_features=2560, out_features=16, bias=False)
	)
	(lora_B): ModuleDict(
	(default): Linear(in_features=16, out_features=2560, bias=False)
	)
	(lora_embedding_A): ParameterDict()
	(lora_embedding_B): ParameterDict()
	)
	(o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
	(rotary_emb): LlamaRotaryEmbedding()
	)
	(mlp): LlamaMLP(
	(gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
	(up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
	(down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
	(act_fn): SiLU()
	)
	(input_layernorm): LlamaRMSNorm()
	(post_attention_layernorm): LlamaRMSNorm()
	)
	)
	(norm): LlamaRMSNorm()
	)
	(lm_head): Linear(in_features=2560, out_features=46304, bias=False)
	)
	)
	)
	(avfeat_to_llm): Linear(in_features=1024, out_features=2560, bias=True)
	)
	[2024-06-28 15:04:30,388][fairseq_cli.train][INFO] - task: VSP_LLM_TrainingTask
	[2024-06-28 15:04:30,388][fairseq_cli.train][INFO] - model: avhubert_llm_seq2seq_cluster_count
	[2024-06-28 15:04:30,388][fairseq_cli.train][INFO] - criterion: decoder_only_language_modeling_loss
	[2024-06-28 15:04:30,392][fairseq_cli.train][INFO] - num. shared model params: 1,841,644,264 (num. trained: 335,624,424)
	[2024-06-28 15:04:30,394][fairseq_cli.train][INFO] - num. expert model params: 0 (num. trained: 0)
	[2024-06-28 15:04:30,395][src.vsp_llm_training][INFO] - Using tokenizer
	[2024-06-28 15:04:30,434][src.vsp_llm_dataset][INFO] - max_keep=500, min_keep=None, loaded 23990, skipped 0 short and 0 long and 0 unaligned, longest-loaded=76, shortest-loaded=76
	[2024-06-28 15:04:30,796][src.vsp_llm_dataset][INFO] - /home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base/valid.wrd is sequence label. skipped
	[2024-06-28 15:04:30,796][src.vsp_llm_dataset][INFO] - image transform: Compose(
	Normalize(mean=0.0, std=255.0)
	<src.utils_vsp_llm.CenterCrop object at 0x78c98d2ecca0>
	Normalize(mean=0.421, std=0.165)
	)
	[2024-06-28 15:04:30,796][src.vsp_llm_dataset][INFO] - pad_audio=True, random_crop=False, normalize=True, max_sample_size=500, seqs2seq data=True,
	[2024-06-28 15:04:30,796][src.vsp_llm_dataset][INFO] - Noise wav: None->0 wav, Prob: 0.0, SNR: 0, Number of mixture: 1
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.0.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.0.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.1.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer1.1.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.0.downsample.0.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.1.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer2.1.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.0.downsample.0.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.1.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer3.1.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.0.downsample.0.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.1.conv1.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- encoder.w2v_model.feature_extractor_video.resnet.trunk.layer4.1.conv2.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,970][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.self_attn.o_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.gate_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.up_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.0.mlp.down_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.self_attn.o_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.gate_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.up_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.1.mlp.down_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.self_attn.o_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.gate_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.up_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.2.mlp.down_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.self_attn.o_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.gate_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.up_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.3.mlp.down_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.self_attn.o_proj.bias
	[2024-06-28 15:04:30,971][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.gate_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.up_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.4.mlp.down_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.self_attn.o_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.gate_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.up_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.5.mlp.down_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.self_attn.o_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.gate_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.up_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.6.mlp.down_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.self_attn.o_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.gate_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.up_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.7.mlp.down_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.self_attn.o_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.gate_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.up_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.8.mlp.down_proj.bias
	[2024-06-28 15:04:30,972][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.self_attn.o_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.gate_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.up_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.9.mlp.down_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.self_attn.o_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.gate_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.up_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.10.mlp.down_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.self_attn.o_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.gate_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.up_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.11.mlp.down_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.self_attn.o_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.gate_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.up_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.12.mlp.down_proj.bias
	[2024-06-28 15:04:30,973][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.self_attn.o_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.gate_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.up_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.13.mlp.down_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.self_attn.o_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.gate_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.up_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.14.mlp.down_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.self_attn.o_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.gate_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.up_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.15.mlp.down_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.self_attn.o_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.gate_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.up_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.16.mlp.down_proj.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,974][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.self_attn.o_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.gate_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.up_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.17.mlp.down_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.self_attn.o_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.gate_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.up_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.18.mlp.down_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.self_attn.o_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.gate_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.up_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.19.mlp.down_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.self_attn.o_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.gate_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.up_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.20.mlp.down_proj.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,975][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.self_attn.o_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.gate_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.up_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.21.mlp.down_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.self_attn.o_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.gate_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.up_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.22.mlp.down_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.self_attn.o_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.gate_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.up_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.23.mlp.down_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.self_attn.o_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.gate_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.up_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.24.mlp.down_proj.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,976][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.self_attn.o_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.gate_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.up_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.25.mlp.down_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.self_attn.o_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.gate_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.up_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.26.mlp.down_proj.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,977][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.self_attn.o_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.gate_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.up_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.27.mlp.down_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.self_attn.o_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.gate_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.up_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.28.mlp.down_proj.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,978][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.self_attn.o_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.gate_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.up_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.29.mlp.down_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.self_attn.o_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.gate_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.up_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.30.mlp.down_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.base_layer.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.self_attn.o_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.gate_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.up_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.model.layers.31.mlp.down_proj.bias
	[2024-06-28 15:04:30,979][fairseq.trainer][INFO] - detected shared parameter: encoder.w2v_model.feature_extractor_video.resnet.frontend3D.0.bias <- decoder.base_model.model.lm_head.bias
	[2024-06-28 15:04:30,979][fairseq.utils][INFO] - *********************CUDA enviroments for all 1 workers*********************
	[2024-06-28 15:04:30,979][fairseq.utils][INFO] - rank 0: capabilities = 8.6 ; total memory = 15.729 GB ; name = NVIDIA RTX A4000
	[2024-06-28 15:04:30,979][fairseq.utils][INFO] - *********************CUDA enviroments for all 1 workers*********************
	[2024-06-28 15:04:30,980][fairseq_cli.train][INFO] - training on 1 devices (GPUs/TPUs)
	[2024-06-28 15:04:30,980][fairseq_cli.train][INFO] - max tokens per device = None and max sentences per device = 1
	[2024-06-28 15:04:30,980][fairseq.trainer][INFO] - Preparing to load checkpoint checkpoints/checkpoint_last.pt
	[2024-06-28 15:04:30,980][fairseq.trainer][INFO] - No existing checkpoint found checkpoints/checkpoint_last.pt
	[2024-06-28 15:04:30,980][fairseq.trainer][INFO] - loading train data for epoch 1
	[2024-06-28 15:04:30,980][src.vsp_llm_training][INFO] - Using tokenizer
	[2024-06-28 15:04:31,160][src.vsp_llm_dataset][INFO] - max_keep=500, min_keep=None, loaded 120686, skipped 0 short and 0 long and 0 unaligned, longest-loaded=76, shortest-loaded=73
	[2024-06-28 15:04:32,079][src.vsp_llm_dataset][INFO] - /home/theodore/Projects/VSP-LLM/data/processed/vasr/100h/base/train.wrd is sequence label. skipped
	[2024-06-28 15:04:32,079][src.vsp_llm_dataset][INFO] - image transform: Compose(
	Normalize(mean=0.0, std=255.0)
	RandomCrop(size=(88, 88))
	<src.utils_vsp_llm.HorizontalFlip object at 0x78c98d7f4610>
	Normalize(mean=0.421, std=0.165)
	)
	[2024-06-28 15:04:32,079][src.vsp_llm_dataset][INFO] - pad_audio=True, random_crop=False, normalize=True, max_sample_size=500, seqs2seq data=True,
	[2024-06-28 15:04:32,079][src.vsp_llm_dataset][INFO] - Noise wav: None->0 wav, Prob: 0.0, SNR: 0, Number of mixture: 1
	[2024-06-28 15:04:37,047][fairseq.trainer][INFO] - begin training epoch 1
	[2024-06-28 15:04:37,048][fairseq_cli.train][INFO] - Start iterating over samples
	[2024-06-28 15:10:05,110][train_inner][INFO] - {"epoch": 1, "update": 0.013, "loss": "7.619", "ntokens": "126.725", "acc_total": "126.725", "n_correct": "18.26", "wer_total": "126.725", "n_error": "108.385", "ppl": "196.62", "accuracy": "14.409", "wer": "85.528", "wps": "77.4", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "200", "lr": "1.49e-05", "gnorm": "8.779", "loss_scale": "128", "train_wall": "327", "gb_free": "7.1", "wall": "334"}
	[2024-06-28 15:15:34,943][train_inner][INFO] - {"epoch": 1, "update": 0.027, "loss": "6.196", "ntokens": "126.93", "acc_total": "126.93", "n_correct": "25.775", "wer_total": "126.93", "n_error": "100.96", "ppl": "73.32", "accuracy": "20.306", "wer": "79.54", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "400", "lr": "2.48e-05", "gnorm": "3.728", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "664"}
	[2024-06-28 15:21:04,562][train_inner][INFO] - {"epoch": 1, "update": 0.04, "loss": "6.072", "ntokens": "127.015", "acc_total": "127.015", "n_correct": "28.655", "wer_total": "127.015", "n_error": "98.075", "ppl": "67.27", "accuracy": "22.56", "wer": "77.215", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "600", "lr": "3.47e-05", "gnorm": "3.962", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "994"}
	[2024-06-28 15:26:34,367][train_inner][INFO] - {"epoch": 1, "update": 0.053, "loss": "5.867", "ntokens": "126.865", "acc_total": "126.865", "n_correct": "30.67", "wer_total": "126.865", "n_error": "95.935", "ppl": "58.38", "accuracy": "24.175", "wer": "75.62", "wps": "76.9", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "800", "lr": "4.46e-05", "gnorm": "4.11", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1323"}
	[2024-06-28 15:32:03,983][train_inner][INFO] - {"epoch": 1, "update": 0.066, "loss": "5.932", "ntokens": "127.025", "acc_total": "127.025", "n_correct": "30.47", "wer_total": "127.025", "n_error": "96.275", "ppl": "61.05", "accuracy": "23.987", "wer": "75.792", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "1000", "lr": "5.45e-05", "gnorm": "3.8", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1653"}
	[2024-06-28 15:37:33,509][train_inner][INFO] - {"epoch": 1, "update": 0.08, "loss": "5.882", "ntokens": "127.095", "acc_total": "127.095", "n_correct": "31.01", "wer_total": "127.095", "n_error": "95.865", "ppl": "58.99", "accuracy": "24.399", "wer": "75.428", "wps": "77.1", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "1200", "lr": "6.44e-05", "gnorm": "3.576", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "1983"}
	[2024-06-28 15:43:03,141][train_inner][INFO] - {"epoch": 1, "update": 0.093, "loss": "5.724", "ntokens": "127.62", "acc_total": "127.62", "n_correct": "32.185", "wer_total": "127.62", "n_error": "95.23", "ppl": "52.85", "accuracy": "25.219", "wer": "74.62", "wps": "77.4", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "1400", "lr": "7.43e-05", "gnorm": "3.416", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2312"}
	[2024-06-28 15:48:32,588][train_inner][INFO] - {"epoch": 1, "update": 0.106, "loss": "5.736", "ntokens": "127.41", "acc_total": "127.41", "n_correct": "32.635", "wer_total": "127.41", "n_error": "94.51", "ppl": "53.29", "accuracy": "25.614", "wer": "74.178", "wps": "77.3", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "1600", "lr": "8.42e-05", "gnorm": "3.143", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2642"}
	[2024-06-28 15:54:02,283][train_inner][INFO] - {"epoch": 1, "update": 0.119, "loss": "5.777", "ntokens": "126.56", "acc_total": "126.56", "n_correct": "32.16", "wer_total": "126.56", "n_error": "94.16", "ppl": "54.84", "accuracy": "25.411", "wer": "74.399", "wps": "76.8", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "1800", "lr": "9.41e-05", "gnorm": "2.975", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "2971"}
	[2024-06-28 15:59:31,782][train_inner][INFO] - {"epoch": 1, "update": 0.133, "loss": "5.678", "ntokens": "126.875", "acc_total": "126.875", "n_correct": "33.55", "wer_total": "126.875", "n_error": "93.105", "ppl": "51.19", "accuracy": "26.443", "wer": "73.383", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "2000", "lr": "0.000104", "gnorm": "2.808", "loss_scale": "128", "train_wall": "329", "gb_free": "7.1", "wall": "3301"}
	[2024-06-28 16:05:01,465][train_inner][INFO] - {"epoch": 1, "update": 0.146, "loss": "5.596", "ntokens": "128.2", "acc_total": "128.2", "n_correct": "34.86", "wer_total": "128.2", "n_error": "93.035", "ppl": "48.38", "accuracy": "27.192", "wer": "72.57", "wps": "77.8", "ups": "0.61", "wpb": "128.2", "bsz": "8", "num_updates": "2200", "lr": "0.0001139", "gnorm": "2.781", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "3630"}
	[2024-06-28 16:10:31,180][train_inner][INFO] - {"epoch": 1, "update": 0.159, "loss": "5.49", "ntokens": "127.775", "acc_total": "127.775", "n_correct": "37.17", "wer_total": "127.775", "n_error": "90.4", "ppl": "44.93", "accuracy": "29.09", "wer": "70.749", "wps": "77.5", "ups": "0.61", "wpb": "127.8", "bsz": "8", "num_updates": "2400", "lr": "0.0001238", "gnorm": "2.898", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "3960"}
	[2024-06-28 16:13:16,070][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-28 16:55:14,394][valid][INFO] - {"epoch": 1, "valid_loss": "5.238", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "5.72176", "valid_wer_total": "18.1585", "valid_n_error": "12.414", "valid_ppl": "37.75", "valid_accuracy": "31.51", "valid_wer": "68.365", "valid_wps": "173", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "2500"}
	[2024-06-28 16:55:14,394][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 2500 updates
	[2024-06-28 16:55:14,395][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_2500.pt
	[2024-06-28 16:55:17,656][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_2500.pt
	[2024-06-28 16:55:20,765][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_2500.pt (epoch 1 @ 2500 updates, score 31.51) (writing took 6.370615527965128 seconds)
	[2024-06-28 16:58:05,201][train_inner][INFO] - {"epoch": 1, "update": 0.172, "loss": "5.303", "ntokens": "126.92", "acc_total": "126.92", "n_correct": "39.855", "wer_total": "126.92", "n_error": "86.795", "ppl": "39.47", "accuracy": "31.402", "wer": "68.386", "wps": "8.9", "ups": "0.07", "wpb": "126.9", "bsz": "8", "num_updates": "2600", "lr": "0.0001337", "gnorm": "3.272", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "6814"}
	[2024-06-28 17:03:34,801][train_inner][INFO] - {"epoch": 1, "update": 0.186, "loss": "5.138", "ntokens": "125.685", "acc_total": "125.685", "n_correct": "42.305", "wer_total": "125.685", "n_error": "83.105", "ppl": "35.22", "accuracy": "33.66", "wer": "66.122", "wps": "76.3", "ups": "0.61", "wpb": "125.7", "bsz": "8", "num_updates": "2800", "lr": "0.0001436", "gnorm": "3.73", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7144"}
	[2024-06-28 17:09:04,126][train_inner][INFO] - {"epoch": 1, "update": 0.199, "loss": "4.915", "ntokens": "127.19", "acc_total": "127.19", "n_correct": "46.61", "wer_total": "127.19", "n_error": "80.315", "ppl": "30.17", "accuracy": "36.646", "wer": "63.146", "wps": "77.2", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "3000", "lr": "0.0001535", "gnorm": "3.993", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7473"}
	[2024-06-28 17:14:33,625][train_inner][INFO] - {"epoch": 1, "update": 0.212, "loss": "4.638", "ntokens": "126.535", "acc_total": "126.535", "n_correct": "49.94", "wer_total": "126.535", "n_error": "76.375", "ppl": "24.9", "accuracy": "39.467", "wer": "60.359", "wps": "76.8", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "3200", "lr": "0.0001634", "gnorm": "4.188", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "7803"}
	[2024-06-28 17:20:03,104][train_inner][INFO] - {"epoch": 1, "update": 0.225, "loss": "4.469", "ntokens": "126.53", "acc_total": "126.53", "n_correct": "52.54", "wer_total": "126.53", "n_error": "73.81", "ppl": "22.15", "accuracy": "41.524", "wer": "58.334", "wps": "76.8", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "3400", "lr": "0.0001733", "gnorm": "4.282", "loss_scale": "256", "train_wall": "329", "gb_free": "7.1", "wall": "8132"}
	[2024-06-28 17:25:32,258][train_inner][INFO] - {"epoch": 1, "update": 0.239, "loss": "4.223", "ntokens": "127.025", "acc_total": "127.025", "n_correct": "55.58", "wer_total": "127.025", "n_error": "71.27", "ppl": "18.67", "accuracy": "43.755", "wer": "56.107", "wps": "77.2", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "3600", "lr": "0.0001832", "gnorm": "4.387", "loss_scale": "256", "train_wall": "328", "gb_free": "7.1", "wall": "8461"}
	[2024-06-28 17:31:01,415][train_inner][INFO] - {"epoch": 1, "update": 0.252, "loss": "4.08", "ntokens": "127.35", "acc_total": "127.35", "n_correct": "57.55", "wer_total": "127.35", "n_error": "69.605", "ppl": "16.91", "accuracy": "45.19", "wer": "54.656", "wps": "77.4", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "3800", "lr": "0.0001931", "gnorm": "4.439", "loss_scale": "256", "train_wall": "328", "gb_free": "7.1", "wall": "8790"}
	[2024-06-28 17:36:30,577][train_inner][INFO] - {"epoch": 1, "update": 0.265, "loss": "3.95", "ntokens": "127.785", "acc_total": "127.785", "n_correct": "59.515", "wer_total": "127.785", "n_error": "68.105", "ppl": "15.45", "accuracy": "46.574", "wer": "53.297", "wps": "77.6", "ups": "0.61", "wpb": "127.8", "bsz": "8", "num_updates": "4000", "lr": "0.000203", "gnorm": "4.404", "loss_scale": "256", "train_wall": "328", "gb_free": "7.1", "wall": "9120"}
	[2024-06-28 17:41:59,531][train_inner][INFO] - {"epoch": 1, "update": 0.278, "loss": "3.789", "ntokens": "126.255", "acc_total": "126.255", "n_correct": "61.005", "wer_total": "126.255", "n_error": "65.09", "ppl": "13.82", "accuracy": "48.319", "wer": "51.554", "wps": "76.8", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "4200", "lr": "0.0002129", "gnorm": "4.425", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "9449"}
	[2024-06-28 17:47:28,588][train_inner][INFO] - {"epoch": 1, "update": 0.292, "loss": "3.672", "ntokens": "125.9", "acc_total": "125.9", "n_correct": "61.95", "wer_total": "125.9", "n_error": "63.805", "ppl": "12.74", "accuracy": "49.206", "wer": "50.679", "wps": "76.5", "ups": "0.61", "wpb": "125.9", "bsz": "8", "num_updates": "4400", "lr": "0.0002228", "gnorm": "4.36", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "9778"}
	[2024-06-28 17:52:57,711][train_inner][INFO] - {"epoch": 1, "update": 0.305, "loss": "3.533", "ntokens": "127.885", "acc_total": "127.885", "n_correct": "64.97", "wer_total": "127.885", "n_error": "62.755", "ppl": "11.58", "accuracy": "50.803", "wer": "49.071", "wps": "77.7", "ups": "0.61", "wpb": "127.9", "bsz": "8", "num_updates": "4600", "lr": "0.0002327", "gnorm": "4.357", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "10107"}
	[2024-06-28 17:58:26,851][train_inner][INFO] - {"epoch": 1, "update": 0.318, "loss": "3.427", "ntokens": "126.21", "acc_total": "126.21", "n_correct": "65.225", "wer_total": "126.21", "n_error": "60.875", "ppl": "10.76", "accuracy": "51.68", "wer": "48.233", "wps": "76.7", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "4800", "lr": "0.0002426", "gnorm": "4.341", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "10436"}
	[2024-06-28 18:03:55,894][train_inner][INFO] - {"epoch": 1, "update": 0.331, "loss": "3.361", "ntokens": "126.87", "acc_total": "126.87", "n_correct": "66.375", "wer_total": "126.87", "n_error": "60.31", "ppl": "10.28", "accuracy": "52.317", "wer": "47.537", "wps": "77.1", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "5000", "lr": "0.0002525", "gnorm": "4.314", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "10765"}
	[2024-06-28 18:03:55,894][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-28 18:45:48,710][valid][INFO] - {"epoch": 1, "valid_loss": "2.995", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "10.2574", "valid_wer_total": "18.1585", "valid_n_error": "7.88499", "valid_ppl": "7.97", "valid_accuracy": "56.488", "valid_wer": "43.423", "valid_wps": "173.4", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "5000", "valid_best_accuracy": "56.488"}
	[2024-06-28 18:45:48,710][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 5000 updates
	[2024-06-28 18:45:48,711][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_5000.pt
	[2024-06-28 18:45:51,985][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_5000.pt
	[2024-06-28 18:45:55,936][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_5000.pt (epoch 1 @ 5000 updates, score 56.488) (writing took 7.225767091149464 seconds)
	[2024-06-28 18:51:24,287][train_inner][INFO] - {"epoch": 1, "update": 0.345, "loss": "3.133", "ntokens": "126.51", "acc_total": "126.51", "n_correct": "69.09", "wer_total": "126.51", "n_error": "57.27", "ppl": "8.77", "accuracy": "54.612", "wer": "45.269", "wps": "8.9", "ups": "0.07", "wpb": "126.5", "bsz": "8", "num_updates": "5200", "lr": "0.0002624", "gnorm": "4.176", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "13613"}
	[2024-06-28 18:56:52,805][train_inner][INFO] - {"epoch": 1, "update": 0.358, "loss": "3.151", "ntokens": "127.425", "acc_total": "127.425", "n_correct": "69.105", "wer_total": "127.425", "n_error": "58.2", "ppl": "8.88", "accuracy": "54.232", "wer": "45.674", "wps": "77.6", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "5400", "lr": "0.0002723", "gnorm": "4.227", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "13942"}
	[2024-06-28 19:02:21,576][train_inner][INFO] - {"epoch": 1, "update": 0.371, "loss": "3.017", "ntokens": "127.52", "acc_total": "127.52", "n_correct": "70.66", "wer_total": "127.52", "n_error": "56.73", "ppl": "8.1", "accuracy": "55.411", "wer": "44.487", "wps": "77.6", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "5600", "lr": "0.0002822", "gnorm": "4.252", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "14271"}
	[2024-06-28 19:07:50,161][train_inner][INFO] - {"epoch": 1, "update": 0.384, "loss": "3.133", "ntokens": "126.59", "acc_total": "126.59", "n_correct": "69.22", "wer_total": "126.59", "n_error": "57.2", "ppl": "8.77", "accuracy": "54.68", "wer": "45.185", "wps": "77.1", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "5800", "lr": "0.0002921", "gnorm": "4.121", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "14599"}
	[2024-06-28 19:13:18,745][train_inner][INFO] - {"epoch": 1, "update": 0.398, "loss": "2.982", "ntokens": "127.395", "acc_total": "127.395", "n_correct": "71.1", "wer_total": "127.395", "n_error": "56.175", "ppl": "7.9", "accuracy": "55.811", "wer": "44.095", "wps": "77.5", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "6000", "lr": "0.000302", "gnorm": "4.064", "loss_scale": "512", "train_wall": "328", "gb_free": "7.1", "wall": "14928"}
	[2024-06-28 19:18:47,224][train_inner][INFO] - {"epoch": 1, "update": 0.411, "loss": "2.905", "ntokens": "126.995", "acc_total": "126.995", "n_correct": "71.91", "wer_total": "126.995", "n_error": "54.935", "ppl": "7.49", "accuracy": "56.624", "wer": "43.258", "wps": "77.3", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "6200", "lr": "0.0003119", "gnorm": "4.076", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "15256"}
	[2024-06-28 19:24:15,804][train_inner][INFO] - {"epoch": 1, "update": 0.424, "loss": "2.962", "ntokens": "126.01", "acc_total": "126.01", "n_correct": "70.81", "wer_total": "126.01", "n_error": "55.125", "ppl": "7.79", "accuracy": "56.194", "wer": "43.747", "wps": "76.7", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "6400", "lr": "0.0003218", "gnorm": "4.064", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "15585"}
	[2024-06-28 19:29:44,329][train_inner][INFO] - {"epoch": 1, "update": 0.437, "loss": "2.826", "ntokens": "126.33", "acc_total": "126.33", "n_correct": "73.01", "wer_total": "126.33", "n_error": "53.2", "ppl": "7.09", "accuracy": "57.793", "wer": "42.112", "wps": "76.9", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "6600", "lr": "0.0003317", "gnorm": "4.099", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "15913"}
	[2024-06-28 19:35:13,152][train_inner][INFO] - {"epoch": 1, "update": 0.451, "loss": "2.799", "ntokens": "126.735", "acc_total": "126.735", "n_correct": "73.21", "wer_total": "126.735", "n_error": "53.36", "ppl": "6.96", "accuracy": "57.766", "wer": "42.104", "wps": "77.1", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "6800", "lr": "0.0003416", "gnorm": "3.991", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16242"}
	[2024-06-28 19:40:41,979][train_inner][INFO] - {"epoch": 1, "update": 0.464, "loss": "2.685", "ntokens": "126.98", "acc_total": "126.98", "n_correct": "75.195", "wer_total": "126.98", "n_error": "51.67", "ppl": "6.43", "accuracy": "59.218", "wer": "40.691", "wps": "77.2", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "7000", "lr": "0.0003515", "gnorm": "3.923", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16571"}
	[2024-06-28 19:46:10,531][train_inner][INFO] - {"epoch": 1, "update": 0.477, "loss": "2.789", "ntokens": "126.97", "acc_total": "126.97", "n_correct": "73.915", "wer_total": "126.97", "n_error": "52.975", "ppl": "6.91", "accuracy": "58.215", "wer": "41.722", "wps": "77.3", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "7200", "lr": "0.0003614", "gnorm": "4.067", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "16900"}
	[2024-06-28 19:51:38,996][train_inner][INFO] - {"epoch": 1, "update": 0.491, "loss": "2.606", "ntokens": "127.45", "acc_total": "127.45", "n_correct": "76.485", "wer_total": "127.45", "n_error": "50.85", "ppl": "6.09", "accuracy": "60.012", "wer": "39.898", "wps": "77.6", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "7400", "lr": "0.0003713", "gnorm": "3.982", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "17228"}
	[2024-06-28 19:54:23,368][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-28 20:36:15,211][valid][INFO] - {"epoch": 1, "valid_loss": "2.382", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "11.4619", "valid_wer_total": "18.1585", "valid_n_error": "6.68278", "valid_ppl": "5.21", "valid_accuracy": "63.121", "valid_wer": "36.803", "valid_wps": "173.4", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "7500", "valid_best_accuracy": "63.121"}
	[2024-06-28 20:36:15,212][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 7500 updates
	[2024-06-28 20:36:15,212][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_7500.pt
	[2024-06-28 20:36:18,557][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_7500.pt
	[2024-06-28 20:36:22,500][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_7500.pt (epoch 1 @ 7500 updates, score 63.121) (writing took 7.287791126174852 seconds)
	[2024-06-28 20:39:06,518][train_inner][INFO] - {"epoch": 1, "update": 0.504, "loss": "2.64", "ntokens": "127.32", "acc_total": "127.32", "n_correct": "76.035", "wer_total": "127.32", "n_error": "51.205", "ppl": "6.23", "accuracy": "59.72", "wer": "40.218", "wps": "8.9", "ups": "0.07", "wpb": "127.3", "bsz": "8", "num_updates": "7600", "lr": "0.0003812", "gnorm": "3.95", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "20076"}
	[2024-06-28 20:44:35,456][train_inner][INFO] - {"epoch": 1, "update": 0.517, "loss": "2.656", "ntokens": "127.11", "acc_total": "127.11", "n_correct": "75.465", "wer_total": "127.11", "n_error": "51.495", "ppl": "6.3", "accuracy": "59.37", "wer": "40.512", "wps": "77.3", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "7800", "lr": "0.0003911", "gnorm": "3.982", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "20404"}
	[2024-06-28 20:50:04,075][train_inner][INFO] - {"epoch": 1, "update": 0.53, "loss": "2.623", "ntokens": "126.875", "acc_total": "126.875", "n_correct": "75.86", "wer_total": "126.875", "n_error": "50.855", "ppl": "6.16", "accuracy": "59.791", "wer": "40.083", "wps": "77.2", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "8000", "lr": "0.000401", "gnorm": "3.897", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "20733"}
	[2024-06-28 20:55:32,678][train_inner][INFO] - {"epoch": 1, "update": 0.544, "loss": "2.568", "ntokens": "126.05", "acc_total": "126.05", "n_correct": "76.07", "wer_total": "126.05", "n_error": "49.925", "ppl": "5.93", "accuracy": "60.349", "wer": "39.607", "wps": "76.7", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "8200", "lr": "0.0004109", "gnorm": "4.062", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "21062"}
	[2024-06-28 21:01:01,368][train_inner][INFO] - {"epoch": 1, "update": 0.557, "loss": "2.513", "ntokens": "126.785", "acc_total": "126.785", "n_correct": "77.595", "wer_total": "126.785", "n_error": "49.1", "ppl": "5.71", "accuracy": "61.202", "wer": "38.727", "wps": "77.1", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "8400", "lr": "0.0004208", "gnorm": "3.975", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "21390"}
	[2024-06-28 21:06:30,168][train_inner][INFO] - {"epoch": 1, "update": 0.57, "loss": "2.515", "ntokens": "126.675", "acc_total": "126.675", "n_correct": "77.115", "wer_total": "126.675", "n_error": "49.45", "ppl": "5.72", "accuracy": "60.876", "wer": "39.037", "wps": "77.1", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "8600", "lr": "0.0004307", "gnorm": "3.978", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "21719"}
	[2024-06-28 21:11:58,800][train_inner][INFO] - {"epoch": 1, "update": 0.583, "loss": "2.535", "ntokens": "127.09", "acc_total": "127.09", "n_correct": "77.52", "wer_total": "127.09", "n_error": "49.485", "ppl": "5.79", "accuracy": "60.996", "wer": "38.937", "wps": "77.3", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "8800", "lr": "0.0004406", "gnorm": "4.186", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "22048"}
	[2024-06-28 21:17:27,390][train_inner][INFO] - {"epoch": 1, "update": 0.597, "loss": "2.516", "ntokens": "127.43", "acc_total": "127.43", "n_correct": "77.715", "wer_total": "127.43", "n_error": "49.59", "ppl": "5.72", "accuracy": "60.986", "wer": "38.915", "wps": "77.6", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "9000", "lr": "0.0004505", "gnorm": "3.946", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "22376"}
	[2024-06-28 21:22:56,165][train_inner][INFO] - {"epoch": 1, "update": 0.61, "loss": "2.491", "ntokens": "127.145", "acc_total": "127.145", "n_correct": "78.13", "wer_total": "127.145", "n_error": "48.935", "ppl": "5.62", "accuracy": "61.45", "wer": "38.488", "wps": "77.3", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "9200", "lr": "0.0004604", "gnorm": "4.104", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "22705"}
	[2024-06-28 21:28:24,905][train_inner][INFO] - {"epoch": 1, "update": 0.623, "loss": "2.46", "ntokens": "125.995", "acc_total": "125.995", "n_correct": "77.745", "wer_total": "125.995", "n_error": "48.155", "ppl": "5.5", "accuracy": "61.705", "wer": "38.22", "wps": "76.7", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "9400", "lr": "0.0004703", "gnorm": "3.915", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "23034"}
	[2024-06-28 21:33:53,658][train_inner][INFO] - {"epoch": 1, "update": 0.636, "loss": "2.456", "ntokens": "126.87", "acc_total": "126.87", "n_correct": "78.03", "wer_total": "126.87", "n_error": "48.73", "ppl": "5.49", "accuracy": "61.504", "wer": "38.409", "wps": "77.2", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "9600", "lr": "0.0004802", "gnorm": "4.17", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "23363"}
	[2024-06-28 21:39:22,409][train_inner][INFO] - {"epoch": 1, "update": 0.65, "loss": "2.425", "ntokens": "125.68", "acc_total": "125.68", "n_correct": "77.97", "wer_total": "125.68", "n_error": "47.56", "ppl": "5.37", "accuracy": "62.039", "wer": "37.842", "wps": "76.5", "ups": "0.61", "wpb": "125.7", "bsz": "8", "num_updates": "9800", "lr": "0.0004901", "gnorm": "4.048", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "23691"}
	[2024-06-28 21:44:51,191][train_inner][INFO] - {"epoch": 1, "update": 0.663, "loss": "2.431", "ntokens": "127.56", "acc_total": "127.56", "n_correct": "78.275", "wer_total": "127.56", "n_error": "49.185", "ppl": "5.39", "accuracy": "61.363", "wer": "38.558", "wps": "77.6", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "10000", "lr": "0.0005", "gnorm": "4.052", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "24020"}
	[2024-06-28 21:44:51,191][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-28 22:26:43,998][valid][INFO] - {"epoch": 1, "valid_loss": "nan", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "11.9498", "valid_wer_total": "18.1585", "valid_n_error": "6.19742", "valid_ppl": "nan", "valid_accuracy": "65.808", "valid_wer": "34.13", "valid_wps": "173.4", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "10000", "valid_best_accuracy": "65.808"}
	[2024-06-28 22:26:43,999][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 10000 updates
	[2024-06-28 22:26:43,999][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_10000.pt
	[2024-06-28 22:26:47,289][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_10000.pt
	[2024-06-28 22:26:51,306][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_10000.pt (epoch 1 @ 10000 updates, score 65.808) (writing took 7.306680917041376 seconds)
	[2024-06-28 22:32:19,719][train_inner][INFO] - {"epoch": 1, "update": 0.676, "loss": "2.41", "ntokens": "126.605", "acc_total": "126.605", "n_correct": "78.365", "wer_total": "126.605", "n_error": "48.165", "ppl": "5.31", "accuracy": "61.897", "wer": "38.044", "wps": "8.9", "ups": "0.07", "wpb": "126.6", "bsz": "8", "num_updates": "10200", "lr": "0.000485243", "gnorm": "4.147", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "26869"}
	[2024-06-28 22:37:48,587][train_inner][INFO] - {"epoch": 1, "update": 0.689, "loss": "2.387", "ntokens": "127.295", "acc_total": "127.295", "n_correct": "79.3", "wer_total": "127.295", "n_error": "47.85", "ppl": "5.23", "accuracy": "62.296", "wer": "37.59", "wps": "77.4", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "10400", "lr": "0.000470922", "gnorm": "4.021", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "27198"}
	[2024-06-28 22:40:23,046][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-28 22:43:18,877][train_inner][INFO] - {"epoch": 1, "update": 0.703, "loss": "2.375", "ntokens": "127.815", "acc_total": "127.815", "n_correct": "79.705", "wer_total": "127.815", "n_error": "48.035", "ppl": "5.19", "accuracy": "62.36", "wer": "37.582", "wps": "77.4", "ups": "0.61", "wpb": "127.8", "bsz": "8", "num_updates": "10600", "lr": "0.000457024", "gnorm": "3.875", "loss_scale": "2048", "train_wall": "330", "gb_free": "7.1", "wall": "27528"}
	[2024-06-28 22:48:47,615][train_inner][INFO] - {"epoch": 1, "update": 0.716, "loss": "2.373", "ntokens": "127.33", "acc_total": "127.33", "n_correct": "80.14", "wer_total": "127.33", "n_error": "47.12", "ppl": "5.18", "accuracy": "62.939", "wer": "37.006", "wps": "77.5", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "10800", "lr": "0.000443536", "gnorm": "4.01", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "27857"}
	[2024-06-28 22:54:16,657][train_inner][INFO] - {"epoch": 1, "update": 0.729, "loss": "2.271", "ntokens": "126.09", "acc_total": "126.09", "n_correct": "80.62", "wer_total": "126.09", "n_error": "45.38", "ppl": "4.83", "accuracy": "63.938", "wer": "35.99", "wps": "76.6", "ups": "0.61", "wpb": "126.1", "bsz": "8", "num_updates": "11000", "lr": "0.000430446", "gnorm": "4.067", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "28186"}
	[2024-06-28 22:59:45,546][train_inner][INFO] - {"epoch": 1, "update": 0.742, "loss": "2.23", "ntokens": "126.625", "acc_total": "126.625", "n_correct": "81.96", "wer_total": "126.625", "n_error": "44.595", "ppl": "4.69", "accuracy": "64.727", "wer": "35.218", "wps": "77", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "11200", "lr": "0.000417742", "gnorm": "3.832", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "28515"}
	[2024-06-28 23:05:14,425][train_inner][INFO] - {"epoch": 1, "update": 0.756, "loss": "2.226", "ntokens": "125.82", "acc_total": "125.82", "n_correct": "82.005", "wer_total": "125.82", "n_error": "43.75", "ppl": "4.68", "accuracy": "65.176", "wer": "34.772", "wps": "76.5", "ups": "0.61", "wpb": "125.8", "bsz": "8", "num_updates": "11400", "lr": "0.000405413", "gnorm": "3.907", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "28843"}
	[2024-06-28 23:10:43,209][train_inner][INFO] - {"epoch": 1, "update": 0.769, "loss": "2.219", "ntokens": "126.955", "acc_total": "126.955", "n_correct": "83.135", "wer_total": "126.955", "n_error": "43.72", "ppl": "4.65", "accuracy": "65.484", "wer": "34.437", "wps": "77.2", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "11600", "lr": "0.000393448", "gnorm": "3.78", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "29172"}
	[2024-06-28 23:16:12,010][train_inner][INFO] - {"epoch": 1, "update": 0.782, "loss": "2.183", "ntokens": "126.705", "acc_total": "126.705", "n_correct": "83.16", "wer_total": "126.705", "n_error": "43.495", "ppl": "4.54", "accuracy": "65.633", "wer": "34.328", "wps": "77.1", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "11800", "lr": "0.000381836", "gnorm": "3.962", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "29501"}
	[2024-06-28 23:21:40,975][train_inner][INFO] - {"epoch": 1, "update": 0.796, "loss": "2.136", "ntokens": "126.675", "acc_total": "126.675", "n_correct": "83.685", "wer_total": "126.675", "n_error": "42.885", "ppl": "4.4", "accuracy": "66.063", "wer": "33.854", "wps": "77", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "12000", "lr": "0.000370567", "gnorm": "3.721", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "29830"}
	[2024-06-28 23:27:10,101][train_inner][INFO] - {"epoch": 1, "update": 0.809, "loss": "2.16", "ntokens": "126.855", "acc_total": "126.855", "n_correct": "83.59", "wer_total": "126.855", "n_error": "43.21", "ppl": "4.47", "accuracy": "65.894", "wer": "34.063", "wps": "77.1", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "12200", "lr": "0.000359631", "gnorm": "3.797", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "30159"}
	[2024-06-28 23:32:39,001][train_inner][INFO] - {"epoch": 1, "update": 0.822, "loss": "2.116", "ntokens": "126.96", "acc_total": "126.96", "n_correct": "84.48", "wer_total": "126.96", "n_error": "42.425", "ppl": "4.34", "accuracy": "66.541", "wer": "33.416", "wps": "77.2", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "12400", "lr": "0.000349017", "gnorm": "3.694", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "30488"}
	[2024-06-28 23:35:23,364][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 00:17:51,139][valid][INFO] - {"epoch": 1, "valid_loss": "1.822", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.0769", "valid_wer_total": "18.1585", "valid_n_error": "5.07645", "valid_ppl": "3.54", "valid_accuracy": "72.016", "valid_wer": "27.956", "valid_wps": "171", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "12500", "valid_best_accuracy": "72.016"}
	[2024-06-29 00:17:51,139][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 12500 updates
	[2024-06-29 00:17:51,140][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_12500.pt
	[2024-06-29 00:17:54,367][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_12500.pt
	[2024-06-29 00:17:59,427][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_12500.pt (epoch 1 @ 12500 updates, score 72.016) (writing took 8.287915420951322 seconds)
	[2024-06-29 00:20:17,229][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-29 00:20:45,251][train_inner][INFO] - {"epoch": 1, "update": 0.835, "loss": "2.038", "ntokens": "126.665", "acc_total": "126.665", "n_correct": "85.455", "wer_total": "126.665", "n_error": "41.16", "ppl": "4.11", "accuracy": "67.465", "wer": "32.495", "wps": "8.8", "ups": "0.07", "wpb": "126.7", "bsz": "8", "num_updates": "12600", "lr": "0.000338716", "gnorm": "3.566", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "33374"}
	[2024-06-29 00:25:54,703][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0
	[2024-06-29 00:26:16,098][train_inner][INFO] - {"epoch": 1, "update": 0.849, "loss": "2.018", "ntokens": "127.345", "acc_total": "127.345", "n_correct": "85.87", "wer_total": "127.345", "n_error": "41.375", "ppl": "4.05", "accuracy": "67.431", "wer": "32.49", "wps": "77", "ups": "0.6", "wpb": "127.3", "bsz": "8", "num_updates": "12800", "lr": "0.00032872", "gnorm": "3.57", "loss_scale": "1024", "train_wall": "330", "gb_free": "7.1", "wall": "33705"}
	[2024-06-29 00:31:45,037][train_inner][INFO] - {"epoch": 1, "update": 0.862, "loss": "2.029", "ntokens": "125.855", "acc_total": "125.855", "n_correct": "88.09", "wer_total": "125.855", "n_error": "37.735", "ppl": "4.08", "accuracy": "69.993", "wer": "29.983", "wps": "76.5", "ups": "0.61", "wpb": "125.9", "bsz": "8", "num_updates": "13000", "lr": "0.000319018", "gnorm": "3.781", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "34034"}
	[2024-06-29 00:37:14,146][train_inner][INFO] - {"epoch": 1, "update": 0.875, "loss": "2.045", "ntokens": "126.95", "acc_total": "126.95", "n_correct": "88.045", "wer_total": "126.95", "n_error": "38.83", "ppl": "4.13", "accuracy": "69.354", "wer": "30.587", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "13200", "lr": "0.000309603", "gnorm": "3.704", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "34363"}
	[2024-06-29 00:42:43,160][train_inner][INFO] - {"epoch": 1, "update": 0.888, "loss": "2.035", "ntokens": "127.54", "acc_total": "127.54", "n_correct": "88.515", "wer_total": "127.54", "n_error": "38.97", "ppl": "4.1", "accuracy": "69.402", "wer": "30.555", "wps": "77.5", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "13400", "lr": "0.000300466", "gnorm": "3.558", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "34692"}
	[2024-06-29 00:48:12,040][train_inner][INFO] - {"epoch": 1, "update": 0.902, "loss": "1.95", "ntokens": "126.8", "acc_total": "126.8", "n_correct": "88.05", "wer_total": "126.8", "n_error": "38.705", "ppl": "3.86", "accuracy": "69.44", "wer": "30.524", "wps": "77.1", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "13600", "lr": "0.000291598", "gnorm": "3.39", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "35021"}
	[2024-06-29 00:53:41,066][train_inner][INFO] - {"epoch": 1, "update": 0.915, "loss": "1.894", "ntokens": "127.03", "acc_total": "127.03", "n_correct": "87.845", "wer_total": "127.03", "n_error": "39.125", "ppl": "3.72", "accuracy": "69.153", "wer": "30.8", "wps": "77.2", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "13800", "lr": "0.000282992", "gnorm": "3.462", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "35350"}
	[2024-06-29 00:59:09,570][train_inner][INFO] - {"epoch": 1, "update": 0.928, "loss": "1.941", "ntokens": "127.19", "acc_total": "127.19", "n_correct": "86.74", "wer_total": "127.19", "n_error": "40.405", "ppl": "3.84", "accuracy": "68.197", "wer": "31.767", "wps": "77.4", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "14000", "lr": "0.00027464", "gnorm": "3.512", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "35679"}
	[2024-06-29 01:04:37,960][train_inner][INFO] - {"epoch": 1, "update": 0.941, "loss": "1.908", "ntokens": "126.04", "acc_total": "126.04", "n_correct": "88.88", "wer_total": "126.04", "n_error": "37.12", "ppl": "3.75", "accuracy": "70.517", "wer": "29.451", "wps": "76.8", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "14200", "lr": "0.000266535", "gnorm": "3.561", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36007"}
	[2024-06-29 01:10:06,371][train_inner][INFO] - {"epoch": 1, "update": 0.955, "loss": "1.919", "ntokens": "126.84", "acc_total": "126.84", "n_correct": "89.88", "wer_total": "126.84", "n_error": "36.9", "ppl": "3.78", "accuracy": "70.861", "wer": "29.092", "wps": "77.2", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "14400", "lr": "0.000258668", "gnorm": "3.505", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36335"}
	[2024-06-29 01:15:34,957][train_inner][INFO] - {"epoch": 1, "update": 0.968, "loss": "1.85", "ntokens": "127.515", "acc_total": "127.515", "n_correct": "91.025", "wer_total": "127.515", "n_error": "36.46", "ppl": "3.6", "accuracy": "71.384", "wer": "28.593", "wps": "77.6", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "14600", "lr": "0.000251034", "gnorm": "3.45", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36664"}
	[2024-06-29 01:21:03,414][train_inner][INFO] - {"epoch": 1, "update": 0.981, "loss": "1.909", "ntokens": "127.235", "acc_total": "127.235", "n_correct": "89.72", "wer_total": "127.235", "n_error": "37.495", "ppl": "3.75", "accuracy": "70.515", "wer": "29.469", "wps": "77.5", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "14800", "lr": "0.000243626", "gnorm": "3.51", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "36992"}
	[2024-06-29 01:26:32,135][train_inner][INFO] - {"epoch": 1, "update": 0.994, "loss": "1.876", "ntokens": "127.245", "acc_total": "127.245", "n_correct": "87.47", "wer_total": "127.245", "n_error": "39.72", "ppl": "3.67", "accuracy": "68.741", "wer": "31.215", "wps": "77.4", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "15000", "lr": "0.000236435", "gnorm": "3.594", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "37321"}
	[2024-06-29 01:26:32,135][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 02:08:22,315][valid][INFO] - {"epoch": 1, "valid_loss": "1.573", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.1794", "valid_wer_total": "18.1585", "valid_n_error": "4.97224", "valid_ppl": "2.98", "valid_accuracy": "72.58", "valid_wer": "27.382", "valid_wps": "173.5", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "15000", "valid_best_accuracy": "72.58"}
	[2024-06-29 02:08:22,316][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 15000 updates
	[2024-06-29 02:08:22,316][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_1_15000.pt
	[2024-06-29 02:08:25,494][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_1_15000.pt
	[2024-06-29 02:08:30,465][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_1_15000.pt (epoch 1 @ 15000 updates, score 72.58) (writing took 8.149334152927622 seconds)
	[2024-06-29 02:10:46,042][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 02:52:37,112][valid][INFO] - {"epoch": 1, "valid_loss": "1.595", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.1946", "valid_wer_total": "18.1585", "valid_n_error": "4.95861", "valid_ppl": "3.02", "valid_accuracy": "72.663", "valid_wer": "27.307", "valid_wps": "173.5", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "15083", "valid_best_accuracy": "72.663"}
	[2024-06-29 02:52:37,113][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 1 @ 15083 updates
	[2024-06-29 02:52:37,113][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_best.pt
	[2024-06-29 02:52:40,737][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_best.pt
	[2024-06-29 02:52:42,751][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_best.pt (epoch 1 @ 15083 updates, score 72.663) (writing took 5.637887550983578 seconds)
	[2024-06-29 02:52:42,751][fairseq_cli.train][INFO] - end of epoch 1 (average epoch stats below)
	[2024-06-29 02:52:42,789][train][INFO] - {"epoch": 1, "train_loss": "3.277", "train_ntokens": "126.898", "train_acc_total": "126.898", "train_n_correct": "67.8917", "train_wer_total": "126.898", "train_n_error": "58.876", "train_ppl": "9.69", "train_accuracy": "53.501", "train_wer": "46.396", "train_wps": "45", "train_ups": "0.36", "train_wpb": "126.9", "train_bsz": "8", "train_num_updates": "15083", "train_lr": "0.000233514", "train_gnorm": "3.927", "train_loss_scale": "2048", "train_train_wall": "24760", "train_gb_free": "7.1", "train_wall": "42492"}
	[2024-06-29 02:52:42,842][fairseq.trainer][INFO] - begin training epoch 2
	[2024-06-29 02:52:42,842][fairseq_cli.train][INFO] - Start iterating over samples
	[2024-06-29 02:55:54,765][train_inner][INFO] - {"epoch": 2, "update": 1.008, "loss": "1.766", "ntokens": "127.23", "acc_total": "127.23", "n_correct": "89.775", "wer_total": "127.23", "n_error": "37.405", "ppl": "3.4", "accuracy": "70.561", "wer": "29.4", "wps": "4.7", "ups": "0.04", "wpb": "127.2", "bsz": "8", "num_updates": "15200", "lr": "0.000229457", "gnorm": "3.321", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "42684"}
	[2024-06-29 03:01:23,167][train_inner][INFO] - {"epoch": 2, "update": 1.021, "loss": "1.708", "ntokens": "126.325", "acc_total": "126.325", "n_correct": "89.845", "wer_total": "126.325", "n_error": "36.42", "ppl": "3.27", "accuracy": "71.122", "wer": "28.83", "wps": "76.9", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "15400", "lr": "0.000222685", "gnorm": "3.16", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "43012"}
	[2024-06-29 03:06:51,569][train_inner][INFO] - {"epoch": 2, "update": 1.034, "loss": "1.695", "ntokens": "126.915", "acc_total": "126.915", "n_correct": "89.59", "wer_total": "126.915", "n_error": "37.29", "ppl": "3.24", "accuracy": "70.591", "wer": "29.382", "wps": "77.3", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "15600", "lr": "0.000216113", "gnorm": "3.374", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "43341"}
	[2024-06-29 03:12:19,936][train_inner][INFO] - {"epoch": 2, "update": 1.048, "loss": "1.752", "ntokens": "125.545", "acc_total": "125.545", "n_correct": "87.82", "wer_total": "125.545", "n_error": "37.69", "ppl": "3.37", "accuracy": "69.951", "wer": "30.021", "wps": "76.5", "ups": "0.61", "wpb": "125.5", "bsz": "8", "num_updates": "15800", "lr": "0.000209735", "gnorm": "3.26", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "43669"}
	[2024-06-29 03:17:48,592][train_inner][INFO] - {"epoch": 2, "update": 1.061, "loss": "1.712", "ntokens": "127.405", "acc_total": "127.405", "n_correct": "89.44", "wer_total": "127.405", "n_error": "37.91", "ppl": "3.28", "accuracy": "70.201", "wer": "29.756", "wps": "77.5", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "16000", "lr": "0.000203545", "gnorm": "3.118", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "43998"}
	[2024-06-29 03:23:17,153][train_inner][INFO] - {"epoch": 2, "update": 1.074, "loss": "1.595", "ntokens": "126.915", "acc_total": "126.915", "n_correct": "91.515", "wer_total": "126.915", "n_error": "35.37", "ppl": "3.02", "accuracy": "72.107", "wer": "27.869", "wps": "77.3", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "16200", "lr": "0.000197538", "gnorm": "3.096", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "44326"}
	[2024-06-29 03:28:45,468][train_inner][INFO] - {"epoch": 2, "update": 1.087, "loss": "1.633", "ntokens": "127.14", "acc_total": "127.14", "n_correct": "92.085", "wer_total": "127.14", "n_error": "35.02", "ppl": "3.1", "accuracy": "72.428", "wer": "27.544", "wps": "77.5", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "16400", "lr": "0.000191708", "gnorm": "3.18", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "44654"}
	[2024-06-29 03:34:13,911][train_inner][INFO] - {"epoch": 2, "update": 1.101, "loss": "1.613", "ntokens": "127.51", "acc_total": "127.51", "n_correct": "92.955", "wer_total": "127.51", "n_error": "34.525", "ppl": "3.06", "accuracy": "72.9", "wer": "27.076", "wps": "77.6", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "16600", "lr": "0.00018605", "gnorm": "3.116", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "44983"}
	[2024-06-29 03:39:42,440][train_inner][INFO] - {"epoch": 2, "update": 1.114, "loss": "1.656", "ntokens": "126.405", "acc_total": "126.405", "n_correct": "92.035", "wer_total": "126.405", "n_error": "34.32", "ppl": "3.15", "accuracy": "72.81", "wer": "27.151", "wps": "77", "ups": "0.61", "wpb": "126.4", "bsz": "8", "num_updates": "16800", "lr": "0.000180559", "gnorm": "3.041", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "45311"}
	[2024-06-29 03:45:11,139][train_inner][INFO] - {"epoch": 2, "update": 1.127, "loss": "1.591", "ntokens": "127.925", "acc_total": "127.925", "n_correct": "93.58", "wer_total": "127.925", "n_error": "34.315", "ppl": "3.01", "accuracy": "73.152", "wer": "26.824", "wps": "77.8", "ups": "0.61", "wpb": "127.9", "bsz": "8", "num_updates": "17000", "lr": "0.00017523", "gnorm": "3.042", "loss_scale": "4096", "train_wall": "328", "gb_free": "7.1", "wall": "45640"}
	[2024-06-29 03:45:42,274][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-29 03:50:41,185][train_inner][INFO] - {"epoch": 2, "update": 1.14, "loss": "1.607", "ntokens": "126.05", "acc_total": "126.05", "n_correct": "91.49", "wer_total": "126.05", "n_error": "34.52", "ppl": "3.05", "accuracy": "72.582", "wer": "27.386", "wps": "76.4", "ups": "0.61", "wpb": "126", "bsz": "8", "num_updates": "17200", "lr": "0.000170059", "gnorm": "2.913", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "45970"}
	[2024-06-29 03:56:09,559][train_inner][INFO] - {"epoch": 2, "update": 1.154, "loss": "1.634", "ntokens": "127.295", "acc_total": "127.295", "n_correct": "91.775", "wer_total": "127.295", "n_error": "35.485", "ppl": "3.1", "accuracy": "72.096", "wer": "27.876", "wps": "77.5", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "17400", "lr": "0.00016504", "gnorm": "3.048", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "46299"}
	[2024-06-29 03:58:53,778][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 04:40:41,814][valid][INFO] - {"epoch": 2, "valid_loss": "1.464", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.7075", "valid_wer_total": "18.1585", "valid_n_error": "4.44614", "valid_ppl": "2.76", "valid_accuracy": "75.488", "valid_wer": "24.485", "valid_wps": "173.7", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "17500", "valid_best_accuracy": "75.488"}
	[2024-06-29 04:40:41,818][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 17500 updates
	[2024-06-29 04:40:41,819][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_17500.pt
	[2024-06-29 04:40:45,052][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_17500.pt
	[2024-06-29 04:40:50,241][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_17500.pt (epoch 2 @ 17500 updates, score 75.488) (writing took 8.422401812160388 seconds)
	[2024-06-29 04:43:33,994][train_inner][INFO] - {"epoch": 2, "update": 1.167, "loss": "1.628", "ntokens": "126.19", "acc_total": "126.19", "n_correct": "91.7", "wer_total": "126.19", "n_error": "34.455", "ppl": "3.09", "accuracy": "72.668", "wer": "27.304", "wps": "8.9", "ups": "0.07", "wpb": "126.2", "bsz": "8", "num_updates": "17600", "lr": "0.000160169", "gnorm": "3.182", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "49143"}
	[2024-06-29 04:49:02,353][train_inner][INFO] - {"epoch": 2, "update": 1.18, "loss": "1.52", "ntokens": "128.19", "acc_total": "128.19", "n_correct": "93.425", "wer_total": "128.19", "n_error": "34.745", "ppl": "2.87", "accuracy": "72.88", "wer": "27.104", "wps": "78.1", "ups": "0.61", "wpb": "128.2", "bsz": "8", "num_updates": "17800", "lr": "0.000155442", "gnorm": "3.045", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "49471"}
	[2024-06-29 04:54:30,839][train_inner][INFO] - {"epoch": 2, "update": 1.193, "loss": "1.608", "ntokens": "126.265", "acc_total": "126.265", "n_correct": "90.785", "wer_total": "126.265", "n_error": "35.44", "ppl": "3.05", "accuracy": "71.9", "wer": "28.068", "wps": "76.9", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "18000", "lr": "0.000150854", "gnorm": "2.995", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "49800"}
	[2024-06-29 04:59:59,311][train_inner][INFO] - {"epoch": 2, "update": 1.207, "loss": "1.572", "ntokens": "127.715", "acc_total": "127.715", "n_correct": "92.815", "wer_total": "127.715", "n_error": "34.87", "ppl": "2.97", "accuracy": "72.674", "wer": "27.303", "wps": "77.8", "ups": "0.61", "wpb": "127.7", "bsz": "8", "num_updates": "18200", "lr": "0.000146402", "gnorm": "2.973", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "50128"}
	[2024-06-29 05:05:27,649][train_inner][INFO] - {"epoch": 2, "update": 1.22, "loss": "1.57", "ntokens": "126.27", "acc_total": "126.27", "n_correct": "91.185", "wer_total": "126.27", "n_error": "34.995", "ppl": "2.97", "accuracy": "72.214", "wer": "27.714", "wps": "76.9", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "18400", "lr": "0.000142081", "gnorm": "2.985", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "50457"}
	[2024-06-29 05:10:56,019][train_inner][INFO] - {"epoch": 2, "update": 1.233, "loss": "1.562", "ntokens": "126.87", "acc_total": "126.87", "n_correct": "91.645", "wer_total": "126.87", "n_error": "35.2", "ppl": "2.95", "accuracy": "72.235", "wer": "27.745", "wps": "77.3", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "18600", "lr": "0.000137888", "gnorm": "3.032", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "50785"}
	[2024-06-29 05:16:24,366][train_inner][INFO] - {"epoch": 2, "update": 1.246, "loss": "1.548", "ntokens": "127.155", "acc_total": "127.155", "n_correct": "91.83", "wer_total": "127.155", "n_error": "35.28", "ppl": "2.92", "accuracy": "72.219", "wer": "27.746", "wps": "77.5", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "18800", "lr": "0.000133819", "gnorm": "2.911", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "51113"}
	[2024-06-29 05:21:52,769][train_inner][INFO] - {"epoch": 2, "update": 1.26, "loss": "1.516", "ntokens": "127.695", "acc_total": "127.695", "n_correct": "92.36", "wer_total": "127.695", "n_error": "35.305", "ppl": "2.86", "accuracy": "72.329", "wer": "27.648", "wps": "77.8", "ups": "0.61", "wpb": "127.7", "bsz": "8", "num_updates": "19000", "lr": "0.000129869", "gnorm": "3.038", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "51442"}
	[2024-06-29 05:24:55,008][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-29 05:27:22,862][train_inner][INFO] - {"epoch": 2, "update": 1.273, "loss": "1.592", "ntokens": "127.1", "acc_total": "127.1", "n_correct": "91.67", "wer_total": "127.1", "n_error": "35.41", "ppl": "3.01", "accuracy": "72.124", "wer": "27.86", "wps": "77", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "19200", "lr": "0.000126036", "gnorm": "3.009", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "51772"}
	[2024-06-29 05:32:51,290][train_inner][INFO] - {"epoch": 2, "update": 1.286, "loss": "1.459", "ntokens": "125.85", "acc_total": "125.85", "n_correct": "92.785", "wer_total": "125.85", "n_error": "33.025", "ppl": "2.75", "accuracy": "73.727", "wer": "26.242", "wps": "76.6", "ups": "0.61", "wpb": "125.8", "bsz": "8", "num_updates": "19400", "lr": "0.000122317", "gnorm": "2.939", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "52100"}
	[2024-06-29 05:38:19,693][train_inner][INFO] - {"epoch": 2, "update": 1.3, "loss": "1.512", "ntokens": "125.665", "acc_total": "125.665", "n_correct": "91.54", "wer_total": "125.665", "n_error": "34.095", "ppl": "2.85", "accuracy": "72.844", "wer": "27.132", "wps": "76.5", "ups": "0.61", "wpb": "125.7", "bsz": "8", "num_updates": "19600", "lr": "0.000118707", "gnorm": "2.897", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "52429"}
	[2024-06-29 05:43:48,095][train_inner][INFO] - {"epoch": 2, "update": 1.313, "loss": "1.484", "ntokens": "127.615", "acc_total": "127.615", "n_correct": "93.5", "wer_total": "127.615", "n_error": "34.1", "ppl": "2.8", "accuracy": "73.267", "wer": "26.721", "wps": "77.7", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "19800", "lr": "0.000115203", "gnorm": "2.97", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "52757"}
	[2024-06-29 05:49:16,663][train_inner][INFO] - {"epoch": 2, "update": 1.326, "loss": "1.541", "ntokens": "126.165", "acc_total": "126.165", "n_correct": "92.175", "wer_total": "126.165", "n_error": "33.975", "ppl": "2.91", "accuracy": "73.059", "wer": "26.929", "wps": "76.8", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "20000", "lr": "0.000111803", "gnorm": "2.89", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "53086"}
	[2024-06-29 05:49:16,664][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 06:31:03,741][valid][INFO] - {"epoch": 2, "valid_loss": "1.385", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.7218", "valid_wer_total": "18.1585", "valid_n_error": "4.43235", "valid_ppl": "2.61", "valid_accuracy": "75.567", "valid_wer": "24.409", "valid_wps": "173.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "20000", "valid_best_accuracy": "75.567"}
	[2024-06-29 06:31:03,741][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 20000 updates
	[2024-06-29 06:31:03,742][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_20000.pt
	[2024-06-29 06:31:06,966][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_20000.pt
	[2024-06-29 06:31:12,344][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_20000.pt (epoch 2 @ 20000 updates, score 75.567) (writing took 8.602206402923912 seconds)
	[2024-06-29 06:36:39,694][train_inner][INFO] - {"epoch": 2, "update": 1.339, "loss": "1.534", "ntokens": "126.37", "acc_total": "126.37", "n_correct": "91.965", "wer_total": "126.37", "n_error": "34.375", "ppl": "2.9", "accuracy": "72.774", "wer": "27.202", "wps": "8.9", "ups": "0.07", "wpb": "126.4", "bsz": "8", "num_updates": "20200", "lr": "0.000108504", "gnorm": "2.975", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "55929"}
	[2024-06-29 06:42:07,960][train_inner][INFO] - {"epoch": 2, "update": 1.353, "loss": "1.553", "ntokens": "127.095", "acc_total": "127.095", "n_correct": "92.495", "wer_total": "127.095", "n_error": "34.59", "ppl": "2.93", "accuracy": "72.776", "wer": "27.216", "wps": "77.4", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "20400", "lr": "0.000105301", "gnorm": "2.862", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "56257"}
	[2024-06-29 06:47:35,864][train_inner][INFO] - {"epoch": 2, "update": 1.366, "loss": "1.506", "ntokens": "126.5", "acc_total": "126.5", "n_correct": "92.445", "wer_total": "126.5", "n_error": "34.015", "ppl": "2.84", "accuracy": "73.079", "wer": "26.889", "wps": "77.2", "ups": "0.61", "wpb": "126.5", "bsz": "8", "num_updates": "20600", "lr": "0.000102194", "gnorm": "2.975", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "56585"}
	[2024-06-29 06:53:04,063][train_inner][INFO] - {"epoch": 2, "update": 1.379, "loss": "1.493", "ntokens": "127.265", "acc_total": "127.265", "n_correct": "93.54", "wer_total": "127.265", "n_error": "33.71", "ppl": "2.81", "accuracy": "73.5", "wer": "26.488", "wps": "77.6", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "20800", "lr": "9.91776e-05", "gnorm": "2.944", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "56913"}
	[2024-06-29 06:58:32,057][train_inner][INFO] - {"epoch": 2, "update": 1.392, "loss": "1.486", "ntokens": "127.405", "acc_total": "127.405", "n_correct": "93.685", "wer_total": "127.405", "n_error": "33.665", "ppl": "2.8", "accuracy": "73.533", "wer": "26.424", "wps": "77.7", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "21000", "lr": "9.62506e-05", "gnorm": "2.851", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "57241"}
	[2024-06-29 07:03:59,915][train_inner][INFO] - {"epoch": 2, "update": 1.406, "loss": "1.472", "ntokens": "126.7", "acc_total": "126.7", "n_correct": "94.165", "wer_total": "126.7", "n_error": "32.51", "ppl": "2.77", "accuracy": "74.321", "wer": "25.659", "wps": "77.3", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "21200", "lr": "9.341e-05", "gnorm": "2.773", "loss_scale": "4096", "train_wall": "327", "gb_free": "7.1", "wall": "57569"}
	[2024-06-29 07:09:27,979][train_inner][INFO] - {"epoch": 2, "update": 1.419, "loss": "1.464", "ntokens": "127.32", "acc_total": "127.32", "n_correct": "93.8", "wer_total": "127.32", "n_error": "33.505", "ppl": "2.76", "accuracy": "73.673", "wer": "26.316", "wps": "77.6", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "21400", "lr": "9.06532e-05", "gnorm": "2.811", "loss_scale": "4096", "train_wall": "327", "gb_free": "7.1", "wall": "57897"}
	[2024-06-29 07:11:03,088][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-29 07:14:57,562][train_inner][INFO] - {"epoch": 2, "update": 1.432, "loss": "1.485", "ntokens": "126.76", "acc_total": "126.76", "n_correct": "92.77", "wer_total": "126.76", "n_error": "33.95", "ppl": "2.8", "accuracy": "73.186", "wer": "26.783", "wps": "76.9", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "21600", "lr": "8.79777e-05", "gnorm": "2.873", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "58227"}
	[2024-06-29 07:20:25,544][train_inner][INFO] - {"epoch": 2, "update": 1.445, "loss": "1.431", "ntokens": "125.58", "acc_total": "125.58", "n_correct": "93.515", "wer_total": "125.58", "n_error": "32.04", "ppl": "2.7", "accuracy": "74.466", "wer": "25.514", "wps": "76.6", "ups": "0.61", "wpb": "125.6", "bsz": "8", "num_updates": "21800", "lr": "8.53812e-05", "gnorm": "2.806", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "58555"}
	[2024-06-29 07:25:53,563][train_inner][INFO] - {"epoch": 2, "update": 1.459, "loss": "1.526", "ntokens": "127.565", "acc_total": "127.565", "n_correct": "93.9", "wer_total": "127.565", "n_error": "33.63", "ppl": "2.88", "accuracy": "73.61", "wer": "26.363", "wps": "77.8", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "22000", "lr": "8.28614e-05", "gnorm": "2.893", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "58883"}
	[2024-06-29 07:31:21,503][train_inner][INFO] - {"epoch": 2, "update": 1.472, "loss": "1.414", "ntokens": "126.58", "acc_total": "126.58", "n_correct": "94.395", "wer_total": "126.58", "n_error": "32.15", "ppl": "2.66", "accuracy": "74.573", "wer": "25.399", "wps": "77.2", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "22200", "lr": "8.04159e-05", "gnorm": "2.799", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "59211"}
	[2024-06-29 07:36:49,577][train_inner][INFO] - {"epoch": 2, "update": 1.485, "loss": "1.466", "ntokens": "127.075", "acc_total": "127.075", "n_correct": "93.815", "wer_total": "127.075", "n_error": "33.215", "ppl": "2.76", "accuracy": "73.826", "wer": "26.138", "wps": "77.5", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "22400", "lr": "7.80425e-05", "gnorm": "2.82", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "59539"}
	[2024-06-29 07:39:33,699][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 08:21:21,137][valid][INFO] - {"epoch": 2, "valid_loss": "1.321", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.9083", "valid_wer_total": "18.1585", "valid_n_error": "4.24669", "valid_ppl": "2.5", "valid_accuracy": "76.594", "valid_wer": "23.387", "valid_wps": "173.7", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "22500", "valid_best_accuracy": "76.594"}
	[2024-06-29 08:21:21,138][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 22500 updates
	[2024-06-29 08:21:21,138][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_22500.pt
	[2024-06-29 08:21:24,406][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_22500.pt
	[2024-06-29 08:21:29,692][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_22500.pt (epoch 2 @ 22500 updates, score 76.594) (writing took 8.554857108043507 seconds)
	[2024-06-29 08:24:13,261][train_inner][INFO] - {"epoch": 2, "update": 1.498, "loss": "1.48", "ntokens": "126.79", "acc_total": "126.79", "n_correct": "93.47", "wer_total": "126.79", "n_error": "33.305", "ppl": "2.79", "accuracy": "73.72", "wer": "26.268", "wps": "8.9", "ups": "0.07", "wpb": "126.8", "bsz": "8", "num_updates": "22600", "lr": "7.57393e-05", "gnorm": "2.931", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "62382"}
	[2024-06-29 08:29:41,476][train_inner][INFO] - {"epoch": 2, "update": 1.512, "loss": "1.441", "ntokens": "126.58", "acc_total": "126.58", "n_correct": "93.455", "wer_total": "126.58", "n_error": "33.09", "ppl": "2.71", "accuracy": "73.831", "wer": "26.142", "wps": "77.1", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "22800", "lr": "7.3504e-05", "gnorm": "2.735", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "62710"}
	[2024-06-29 08:35:09,520][train_inner][INFO] - {"epoch": 2, "update": 1.525, "loss": "1.524", "ntokens": "126.155", "acc_total": "126.155", "n_correct": "92.715", "wer_total": "126.155", "n_error": "33.41", "ppl": "2.88", "accuracy": "73.493", "wer": "26.483", "wps": "76.9", "ups": "0.61", "wpb": "126.2", "bsz": "8", "num_updates": "23000", "lr": "7.13346e-05", "gnorm": "3.005", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "63039"}
	[2024-06-29 08:38:29,613][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0
	[2024-06-29 08:40:39,435][train_inner][INFO] - {"epoch": 2, "update": 1.538, "loss": "1.43", "ntokens": "126.92", "acc_total": "126.92", "n_correct": "94.615", "wer_total": "126.92", "n_error": "32.28", "ppl": "2.69", "accuracy": "74.547", "wer": "25.433", "wps": "76.9", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "23200", "lr": "6.92293e-05", "gnorm": "2.769", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "63368"}
	[2024-06-29 08:46:07,701][train_inner][INFO] - {"epoch": 2, "update": 1.552, "loss": "1.361", "ntokens": "126.32", "acc_total": "126.32", "n_correct": "95", "wer_total": "126.32", "n_error": "31.3", "ppl": "2.57", "accuracy": "75.206", "wer": "24.778", "wps": "77", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "23400", "lr": "6.71862e-05", "gnorm": "2.748", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "63697"}
	[2024-06-29 08:51:35,994][train_inner][INFO] - {"epoch": 2, "update": 1.565, "loss": "1.5", "ntokens": "126.68", "acc_total": "126.68", "n_correct": "93.88", "wer_total": "126.68", "n_error": "32.79", "ppl": "2.83", "accuracy": "74.108", "wer": "25.884", "wps": "77.2", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "23600", "lr": "6.52033e-05", "gnorm": "2.989", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "64025"}
	[2024-06-29 08:57:04,281][train_inner][INFO] - {"epoch": 2, "update": 1.578, "loss": "1.449", "ntokens": "127.13", "acc_total": "127.13", "n_correct": "94.685", "wer_total": "127.13", "n_error": "32.405", "ppl": "2.73", "accuracy": "74.479", "wer": "25.49", "wps": "77.5", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "23800", "lr": "6.3279e-05", "gnorm": "2.882", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "64353"}
	[2024-06-29 09:02:32,436][train_inner][INFO] - {"epoch": 2, "update": 1.591, "loss": "1.459", "ntokens": "126.725", "acc_total": "126.725", "n_correct": "94.04", "wer_total": "126.725", "n_error": "32.675", "ppl": "2.75", "accuracy": "74.208", "wer": "25.784", "wps": "77.2", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "24000", "lr": "6.14114e-05", "gnorm": "2.86", "loss_scale": "1024", "train_wall": "327", "gb_free": "7.1", "wall": "64681"}
	[2024-06-29 09:08:00,785][train_inner][INFO] - {"epoch": 2, "update": 1.605, "loss": "1.456", "ntokens": "126.27", "acc_total": "126.27", "n_correct": "93.66", "wer_total": "126.27", "n_error": "32.58", "ppl": "2.74", "accuracy": "74.174", "wer": "25.802", "wps": "76.9", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "24200", "lr": "5.9599e-05", "gnorm": "2.813", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "65010"}
	[2024-06-29 09:13:29,281][train_inner][INFO] - {"epoch": 2, "update": 1.618, "loss": "1.441", "ntokens": "127.195", "acc_total": "127.195", "n_correct": "95.695", "wer_total": "127.195", "n_error": "31.48", "ppl": "2.71", "accuracy": "75.235", "wer": "24.749", "wps": "77.4", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "24400", "lr": "5.784e-05", "gnorm": "2.848", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "65338"}
	[2024-06-29 09:18:57,844][train_inner][INFO] - {"epoch": 2, "update": 1.631, "loss": "1.51", "ntokens": "127.34", "acc_total": "127.34", "n_correct": "94.18", "wer_total": "127.34", "n_error": "33.135", "ppl": "2.85", "accuracy": "73.959", "wer": "26.021", "wps": "77.5", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "24600", "lr": "5.6133e-05", "gnorm": "2.807", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "65667"}
	[2024-06-29 09:24:26,431][train_inner][INFO] - {"epoch": 2, "update": 1.644, "loss": "1.425", "ntokens": "127.295", "acc_total": "127.295", "n_correct": "95.295", "wer_total": "127.295", "n_error": "31.99", "ppl": "2.69", "accuracy": "74.862", "wer": "25.131", "wps": "77.5", "ups": "0.61", "wpb": "127.3", "bsz": "8", "num_updates": "24800", "lr": "5.44763e-05", "gnorm": "2.782", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "65995"}
	[2024-06-29 09:29:55,123][train_inner][INFO] - {"epoch": 2, "update": 1.658, "loss": "1.447", "ntokens": "125.91", "acc_total": "125.91", "n_correct": "93.74", "wer_total": "125.91", "n_error": "32.145", "ppl": "2.73", "accuracy": "74.45", "wer": "25.53", "wps": "76.6", "ups": "0.61", "wpb": "125.9", "bsz": "8", "num_updates": "25000", "lr": "5.28686e-05", "gnorm": "2.717", "loss_scale": "1024", "train_wall": "328", "gb_free": "7.1", "wall": "66324"}
	[2024-06-29 09:29:55,123][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 10:11:43,263][valid][INFO] - {"epoch": 2, "valid_loss": "1.285", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "13.8924", "valid_wer_total": "18.1585", "valid_n_error": "4.26307", "valid_ppl": "2.44", "valid_accuracy": "76.506", "valid_wer": "23.477", "valid_wps": "173.7", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "25000", "valid_best_accuracy": "76.594"}
	[2024-06-29 10:11:43,263][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 25000 updates
	[2024-06-29 10:11:43,264][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_25000.pt
	[2024-06-29 10:11:46,558][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_25000.pt
	[2024-06-29 10:11:48,698][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_25000.pt (epoch 2 @ 25000 updates, score 76.506) (writing took 5.4349518560338765 seconds)
	[2024-06-29 10:17:16,784][train_inner][INFO] - {"epoch": 2, "update": 1.671, "loss": "1.379", "ntokens": "127.435", "acc_total": "127.435", "n_correct": "94.985", "wer_total": "127.435", "n_error": "32.425", "ppl": "2.6", "accuracy": "74.536", "wer": "25.444", "wps": "9", "ups": "0.07", "wpb": "127.4", "bsz": "8", "num_updates": "25200", "lr": "5.13083e-05", "gnorm": "2.727", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "69166"}
	[2024-06-29 10:22:45,159][train_inner][INFO] - {"epoch": 2, "update": 1.684, "loss": "1.425", "ntokens": "125.305", "acc_total": "125.305", "n_correct": "93.495", "wer_total": "125.305", "n_error": "31.79", "ppl": "2.69", "accuracy": "74.614", "wer": "25.37", "wps": "76.3", "ups": "0.61", "wpb": "125.3", "bsz": "8", "num_updates": "25400", "lr": "4.9794e-05", "gnorm": "2.791", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "69494"}
	[2024-06-29 10:28:13,511][train_inner][INFO] - {"epoch": 2, "update": 1.697, "loss": "1.373", "ntokens": "127.065", "acc_total": "127.065", "n_correct": "94.99", "wer_total": "127.065", "n_error": "32.065", "ppl": "2.59", "accuracy": "74.757", "wer": "25.235", "wps": "77.4", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "25600", "lr": "4.83244e-05", "gnorm": "2.746", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "69823"}
	[2024-06-29 10:33:41,817][train_inner][INFO] - {"epoch": 2, "update": 1.711, "loss": "1.371", "ntokens": "127.065", "acc_total": "127.065", "n_correct": "95.335", "wer_total": "127.065", "n_error": "31.705", "ppl": "2.59", "accuracy": "75.029", "wer": "24.952", "wps": "77.4", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "25800", "lr": "4.68982e-05", "gnorm": "2.795", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "70151"}
	[2024-06-29 10:39:10,148][train_inner][INFO] - {"epoch": 2, "update": 1.724, "loss": "1.439", "ntokens": "126.995", "acc_total": "126.995", "n_correct": "94.25", "wer_total": "126.995", "n_error": "32.715", "ppl": "2.71", "accuracy": "74.216", "wer": "25.761", "wps": "77.4", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "26000", "lr": "4.55141e-05", "gnorm": "2.883", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "70479"}
	[2024-06-29 10:44:38,163][train_inner][INFO] - {"epoch": 2, "update": 1.737, "loss": "1.429", "ntokens": "126.31", "acc_total": "126.31", "n_correct": "93.555", "wer_total": "126.31", "n_error": "32.75", "ppl": "2.69", "accuracy": "74.068", "wer": "25.928", "wps": "77", "ups": "0.61", "wpb": "126.3", "bsz": "8", "num_updates": "26200", "lr": "4.41708e-05", "gnorm": "2.859", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "70807"}
	[2024-06-29 10:50:06,206][train_inner][INFO] - {"epoch": 2, "update": 1.75, "loss": "1.395", "ntokens": "127.37", "acc_total": "127.37", "n_correct": "94.835", "wer_total": "127.37", "n_error": "32.5", "ppl": "2.63", "accuracy": "74.456", "wer": "25.516", "wps": "77.7", "ups": "0.61", "wpb": "127.4", "bsz": "8", "num_updates": "26400", "lr": "4.28672e-05", "gnorm": "2.687", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "71135"}
	[2024-06-29 10:55:34,231][train_inner][INFO] - {"epoch": 2, "update": 1.764, "loss": "1.357", "ntokens": "126.69", "acc_total": "126.69", "n_correct": "96.405", "wer_total": "126.69", "n_error": "30.275", "ppl": "2.56", "accuracy": "76.095", "wer": "23.897", "wps": "77.2", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "26600", "lr": "4.16021e-05", "gnorm": "2.717", "loss_scale": "2048", "train_wall": "327", "gb_free": "7.1", "wall": "71463"}
	[2024-06-29 11:01:02,661][train_inner][INFO] - {"epoch": 2, "update": 1.777, "loss": "1.351", "ntokens": "126.605", "acc_total": "126.605", "n_correct": "96.74", "wer_total": "126.605", "n_error": "29.845", "ppl": "2.55", "accuracy": "76.411", "wer": "23.573", "wps": "77.1", "ups": "0.61", "wpb": "126.6", "bsz": "8", "num_updates": "26800", "lr": "4.03743e-05", "gnorm": "2.685", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "71792"}
	[2024-06-29 11:06:31,237][train_inner][INFO] - {"epoch": 2, "update": 1.79, "loss": "1.434", "ntokens": "127.595", "acc_total": "127.595", "n_correct": "95.505", "wer_total": "127.595", "n_error": "32.07", "ppl": "2.7", "accuracy": "74.85", "wer": "25.134", "wps": "77.7", "ups": "0.61", "wpb": "127.6", "bsz": "8", "num_updates": "27000", "lr": "3.91827e-05", "gnorm": "2.76", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "72120"}
	[2024-06-29 11:11:59,653][train_inner][INFO] - {"epoch": 2, "update": 1.803, "loss": "1.438", "ntokens": "127.94", "acc_total": "127.94", "n_correct": "96.12", "wer_total": "127.94", "n_error": "31.815", "ppl": "2.71", "accuracy": "75.129", "wer": "24.867", "wps": "77.9", "ups": "0.61", "wpb": "127.9", "bsz": "8", "num_updates": "27200", "lr": "3.80263e-05", "gnorm": "2.685", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "72449"}
	[2024-06-29 11:17:24,808][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2048.0
	[2024-06-29 11:17:29,728][train_inner][INFO] - {"epoch": 2, "update": 1.817, "loss": "1.428", "ntokens": "126.67", "acc_total": "126.67", "n_correct": "95.59", "wer_total": "126.67", "n_error": "31.06", "ppl": "2.69", "accuracy": "75.464", "wer": "24.52", "wps": "76.8", "ups": "0.61", "wpb": "126.7", "bsz": "8", "num_updates": "27400", "lr": "3.6904e-05", "gnorm": "2.728", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "72779"}
	[2024-06-29 11:20:14,042][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 12:02:09,222][valid][INFO] - {"epoch": 2, "valid_loss": "1.253", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "14.252", "valid_wer_total": "18.1585", "valid_n_error": "3.90417", "valid_ppl": "2.38", "valid_accuracy": "78.487", "valid_wer": "21.501", "valid_wps": "173.2", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "27500", "valid_best_accuracy": "78.487"}
	[2024-06-29 12:02:09,226][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 27500 updates
	[2024-06-29 12:02:09,227][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_27500.pt
	[2024-06-29 12:02:12,524][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_27500.pt
	[2024-06-29 12:02:17,921][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_27500.pt (epoch 2 @ 27500 updates, score 78.487) (writing took 8.694852707907557 seconds)
	[2024-06-29 12:05:02,203][train_inner][INFO] - {"epoch": 2, "update": 1.83, "loss": "1.4", "ntokens": "126.43", "acc_total": "126.43", "n_correct": "95.465", "wer_total": "126.43", "n_error": "30.94", "ppl": "2.64", "accuracy": "75.508", "wer": "24.472", "wps": "8.9", "ups": "0.07", "wpb": "126.4", "bsz": "8", "num_updates": "27600", "lr": "3.58149e-05", "gnorm": "2.811", "loss_scale": "2048", "train_wall": "328", "gb_free": "7.1", "wall": "75631"}
	[2024-06-29 12:10:31,832][train_inner][INFO] - {"epoch": 2, "update": 1.843, "loss": "1.42", "ntokens": "127.525", "acc_total": "127.525", "n_correct": "96.055", "wer_total": "127.525", "n_error": "31.45", "ppl": "2.68", "accuracy": "75.322", "wer": "24.662", "wps": "77.4", "ups": "0.61", "wpb": "127.5", "bsz": "8", "num_updates": "27800", "lr": "3.47579e-05", "gnorm": "2.794", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "75961"}
	[2024-06-29 12:12:33,591][fairseq.trainer][INFO] - NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1024.0
	[2024-06-29 12:16:02,669][train_inner][INFO] - {"epoch": 2, "update": 1.857, "loss": "1.396", "ntokens": "127.445", "acc_total": "127.445", "n_correct": "95.305", "wer_total": "127.445", "n_error": "32.13", "ppl": "2.63", "accuracy": "74.781", "wer": "25.211", "wps": "77", "ups": "0.6", "wpb": "127.4", "bsz": "8", "num_updates": "28000", "lr": "3.37321e-05", "gnorm": "2.744", "loss_scale": "1024", "train_wall": "330", "gb_free": "7.1", "wall": "76292"}
	[2024-06-29 12:21:32,275][train_inner][INFO] - {"epoch": 2, "update": 1.87, "loss": "1.374", "ntokens": "127.81", "acc_total": "127.81", "n_correct": "96.385", "wer_total": "127.81", "n_error": "31.415", "ppl": "2.59", "accuracy": "75.413", "wer": "24.579", "wps": "77.6", "ups": "0.61", "wpb": "127.8", "bsz": "8", "num_updates": "28200", "lr": "3.27365e-05", "gnorm": "2.68", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "76621"}
	[2024-06-29 12:27:02,045][train_inner][INFO] - {"epoch": 2, "update": 1.883, "loss": "1.387", "ntokens": "127.05", "acc_total": "127.05", "n_correct": "95.47", "wer_total": "127.05", "n_error": "31.55", "ppl": "2.62", "accuracy": "75.144", "wer": "24.833", "wps": "77.1", "ups": "0.61", "wpb": "127", "bsz": "8", "num_updates": "28400", "lr": "3.17704e-05", "gnorm": "2.803", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "76951"}
	[2024-06-29 12:32:31,911][train_inner][INFO] - {"epoch": 2, "update": 1.896, "loss": "1.403", "ntokens": "127.98", "acc_total": "127.98", "n_correct": "96.045", "wer_total": "127.98", "n_error": "31.91", "ppl": "2.65", "accuracy": "75.047", "wer": "24.934", "wps": "77.6", "ups": "0.61", "wpb": "128", "bsz": "8", "num_updates": "28600", "lr": "3.08327e-05", "gnorm": "2.776", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "77281"}
	[2024-06-29 12:38:01,791][train_inner][INFO] - {"epoch": 2, "update": 1.91, "loss": "1.354", "ntokens": "126.855", "acc_total": "126.855", "n_correct": "95.9", "wer_total": "126.855", "n_error": "30.935", "ppl": "2.56", "accuracy": "75.598", "wer": "24.386", "wps": "76.9", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "28800", "lr": "2.99228e-05", "gnorm": "2.706", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "77611"}
	[2024-06-29 12:43:31,782][train_inner][INFO] - {"epoch": 2, "update": 1.923, "loss": "1.358", "ntokens": "127.7", "acc_total": "127.7", "n_correct": "96.79", "wer_total": "127.7", "n_error": "30.88", "ppl": "2.56", "accuracy": "75.795", "wer": "24.182", "wps": "77.4", "ups": "0.61", "wpb": "127.7", "bsz": "8", "num_updates": "29000", "lr": "2.90397e-05", "gnorm": "2.693", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "77941"}
	[2024-06-29 12:49:01,701][train_inner][INFO] - {"epoch": 2, "update": 1.936, "loss": "1.398", "ntokens": "126.79", "acc_total": "126.79", "n_correct": "95.77", "wer_total": "126.79", "n_error": "31", "ppl": "2.64", "accuracy": "75.534", "wer": "24.45", "wps": "76.9", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "29200", "lr": "2.81826e-05", "gnorm": "2.747", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "78271"}
	[2024-06-29 12:54:31,644][train_inner][INFO] - {"epoch": 2, "update": 1.949, "loss": "1.389", "ntokens": "127.11", "acc_total": "127.11", "n_correct": "95.955", "wer_total": "127.11", "n_error": "31.125", "ppl": "2.62", "accuracy": "75.49", "wer": "24.487", "wps": "77", "ups": "0.61", "wpb": "127.1", "bsz": "8", "num_updates": "29400", "lr": "2.73509e-05", "gnorm": "2.722", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "78601"}
	[2024-06-29 13:00:01,405][train_inner][INFO] - {"epoch": 2, "update": 1.963, "loss": "1.359", "ntokens": "127.18", "acc_total": "127.18", "n_correct": "96.115", "wer_total": "127.18", "n_error": "31.045", "ppl": "2.56", "accuracy": "75.574", "wer": "24.41", "wps": "77.1", "ups": "0.61", "wpb": "127.2", "bsz": "8", "num_updates": "29600", "lr": "2.65436e-05", "gnorm": "2.801", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "78930"}
	[2024-06-29 13:05:31,132][train_inner][INFO] - {"epoch": 2, "update": 1.976, "loss": "1.396", "ntokens": "126.935", "acc_total": "126.935", "n_correct": "95.53", "wer_total": "126.935", "n_error": "31.385", "ppl": "2.63", "accuracy": "75.259", "wer": "24.725", "wps": "77", "ups": "0.61", "wpb": "126.9", "bsz": "8", "num_updates": "29800", "lr": "2.57603e-05", "gnorm": "2.741", "loss_scale": "1024", "train_wall": "329", "gb_free": "7.1", "wall": "79260"}
	[2024-06-29 13:11:01,125][train_inner][INFO] - {"epoch": 2, "update": 1.989, "loss": "1.391", "ntokens": "126.82", "acc_total": "126.82", "n_correct": "96.335", "wer_total": "126.82", "n_error": "30.455", "ppl": "2.62", "accuracy": "75.962", "wer": "24.014", "wps": "76.9", "ups": "0.61", "wpb": "126.8", "bsz": "8", "num_updates": "30000", "lr": "2.5e-05", "gnorm": "2.755", "loss_scale": "2048", "train_wall": "329", "gb_free": "7.1", "wall": "79590"}
	[2024-06-29 13:11:01,125][fairseq_cli.train][INFO] - Stopping training due to num_updates: 30000 >= max_update: 30000
	[2024-06-29 13:11:01,126][fairseq_cli.train][INFO] - begin validation on "valid" subset
	[2024-06-29 13:53:01,885][valid][INFO] - {"epoch": 2, "valid_loss": "1.236", "valid_ntokens": "18.1585", "valid_acc_total": "18.1585", "valid_n_correct": "14.2955", "valid_wer_total": "18.1585", "valid_n_error": "3.86115", "valid_ppl": "2.35", "valid_accuracy": "78.727", "valid_wer": "21.264", "valid_wps": "172.8", "valid_wpb": "18.2", "valid_bsz": "1", "valid_num_updates": "30000", "valid_best_accuracy": "78.727"}
	[2024-06-29 13:53:01,885][fairseq.checkpoint_utils][INFO] - Preparing to save checkpoint for epoch 2 @ 30000 updates
	[2024-06-29 13:53:01,886][fairseq.trainer][INFO] - Saving checkpoint to checkpoints/checkpoint_2_30000.pt
	[2024-06-29 13:53:05,120][fairseq.trainer][INFO] - Finished saving checkpoint to checkpoints/checkpoint_2_30000.pt
	[2024-06-29 13:53:10,356][fairseq.checkpoint_utils][INFO] - Saved checkpoint checkpoints/checkpoint_2_30000.pt (epoch 2 @ 30000 updates, score 78.727) (writing took 8.470569632947445 seconds)
	[2024-06-29 13:53:10,387][fairseq_cli.train][INFO] - end of epoch 2 (average epoch stats below)
	[2024-06-29 13:53:10,389][train][INFO] - {"epoch": 2, "train_loss": "1.487", "train_ntokens": "126.907", "train_acc_total": "126.907", "train_n_correct": "93.677", "train_wer_total": "126.907", "train_n_error": "33.2017", "train_ppl": "2.8", "train_accuracy": "73.816", "train_wer": "26.162", "train_wps": "47.8", "train_ups": "0.38", "train_wpb": "126.9", "train_bsz": "8", "train_num_updates": "30000", "train_lr": "2.5e-05", "train_gnorm": "2.886", "train_loss_scale": "2048", "train_train_wall": "24463", "train_gb_free": "7.1", "train_wall": "82119"}
	[2024-06-29 13:53:10,389][fairseq_cli.train][INFO] - done training in 82118.2 seconds