# Source: LatentSync/configs/syncnet/syncnet_25_pixel.yaml
# Uploaded by fffiloni; migrated from GitHub (commit 3650c12, verified).
# SyncNet architecture: one audio encoder and one visual encoder.
# The two encoders are nested separately so that their identically named
# keys (in_channels, block_out_channels, ...) do not collide.
model:
  # Audio branch. Input shape noted by the author: (1, 80, 80).
  audio_encoder:
    in_channels: 1
    # Six entries, the same count as downsample_factors below
    # (presumably one entry per encoder block -- confirm in the model code).
    block_out_channels: [64, 128, 256, 256, 512, 1024]
    downsample_factors: [2, 2, 2, 2, 2, 2]
    dropout: 0.0
  # Visual branch. Input shape noted by the author: (75, 128, 256).
  visual_encoder:
    # NOTE(review): 75 presumably equals data.num_frames (25) x 3 colour
    # channels -- confirm before changing num_frames.
    in_channels: 75
    # Eight entries, the same count as downsample_factors below.
    block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
    # The first entry is a per-axis pair and the rest are scalars
    # (presumably [height, width] -- confirm in the model code).
    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
    dropout: 0.0
# Checkpoint settings.
ckpt:
  # Checkpoint to resume training from; empty string here.
  resume_ckpt_path: ""
  # Checkpoint to load for inference; empty string here.
  inference_ckpt_path: ""
  # Checkpoint interval, in training steps.
  save_ckpt_steps: 2500
# Dataset, data loader and preprocessing settings.
# NOTE: every /mnt/bn/... value below is an absolute path from the original
# author's environment and must be replaced for any other machine.
data:
  train_output_dir: debug/syncnet
  num_val_samples: 2048
  batch_size: 64
  num_workers: 11
  # false: work on pixel-space frames (this is the "*_pixel" config).
  latent_space: false
  # NOTE(review): model.visual_encoder.in_channels (75) presumably depends on
  # this value -- confirm and keep the two in sync.
  num_frames: 25
  resolution: 256
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
  # Alternative training list kept by the author:
  # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
  train_data_dir: ""
  val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
  # Alternative validation list kept by the author:
  # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
  val_data_dir: ""
  audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
  # NOTE(review): presumably restricts frames to their lower half, which would
  # match the 128-pixel height in the visual encoder input note -- confirm.
  lower_half: true
  pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
  audio_sample_rate: 16000
  video_fps: 25
# Optimizer settings.
optimizer:
  # Written with an explicit mantissa decimal point: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `1e-5` to the string '1e-5', not a float.
  lr: 1.0e-5
  max_grad_norm: 1.0
# Run-level training settings.
run:
  max_train_steps: 10000000
  mixed_precision_training: true
  # Fixed random seed.
  seed: 42