model:
  audio_encoder: # input (1, 80, 80)
    in_channels: 1
    block_out_channels: [64, 128, 256, 256, 512, 1024]
    downsample_factors: [2, 2, 2, 2, 2, 2]
    dropout: 0.0
  visual_encoder: # input (75, 128, 256)
    in_channels: 75
    block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
    dropout: 0.0

ckpt:
  resume_ckpt_path: ""
  inference_ckpt_path: ""
  save_ckpt_steps: 2500

data:
  train_output_dir: debug/syncnet
  num_val_samples: 2048
  batch_size: 64 # 64
  num_workers: 11 # 11
  latent_space: false
  num_frames: 25
  resolution: 256
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
  train_data_dir: ""
  val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
  val_data_dir: ""
  audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
  lower_half: true
  pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
  audio_sample_rate: 16000
  video_fps: 25

optimizer:
  lr: 1e-5
  max_grad_norm: 1.0

run:
  max_train_steps: 10000000
  mixed_precision_training: true
  seed: 42
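
# --- Usage note (kept as comments so this file remains valid YAML) ---
# A minimal sketch of how a training script might consume this config,
# assuming OmegaConf is used as the loader; the actual entry point and the
# config path ("configs/syncnet.yaml") are assumptions, not taken from this file.
#
#   from omegaconf import OmegaConf
#
#   config = OmegaConf.load("configs/syncnet.yaml")  # hypothetical path
#   # Nested keys are accessed with attribute syntax:
#   print(config.model.audio_encoder.block_out_channels)  # [64, 128, 256, 256, 512, 1024]
#   print(config.data.batch_size)                         # 64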