Francke committed on
Commit
aad5337
·
1 Parent(s): 089bec4
configs/audio.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ num_mels: 80 # Number of mel-spectrogram channels and local conditioning dimensionality
3
+ rescale: true # Whether to rescale audio prior to preprocessing
4
+ rescaling_max: 0.9 # Rescaling value
5
+ use_lws:
6
+ false # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
7
+ # It's preferred to set this to true when using https://github.com/r9y9/wavenet_vocoder
8
+ # Does not work if n_fft is not a multiple of hop_size!!
9
+ n_fft: 800 # Extra window size is filled with 0 paddings to match this parameter
10
+ hop_size: 200 # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
11
+ win_size: 800 # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
12
+ sample_rate: 16000 # 16000Hz (corresponding to librispeech) (sox --i <filename>)
13
+ frame_shift_ms: null
14
+ signal_normalization: true
15
+ allow_clipping_in_normalization: true
16
+ symmetric_mels: true
17
+ max_abs_value: 4.0
18
+ preemphasize: true # whether to apply filter
19
+ preemphasis: 0.97 # filter coefficient.
20
+ min_level_db: -100
21
+ ref_level_db: 20
22
+ fmin: 55
23
+ fmax: 7600
configs/scheduler_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.6.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "num_train_timesteps": 1000,
9
+ "set_alpha_to_one": false,
10
+ "steps_offset": 1,
11
+ "trained_betas": null,
12
+ "skip_prk_steps": true
13
+ }
configs/syncnet/syncnet_16_latent.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ audio_encoder: # input (1, 80, 52)
3
+ in_channels: 1
4
+ block_out_channels: [32, 64, 128, 256, 512, 1024]
5
+ downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
6
+ attn_blocks: [0, 0, 0, 0, 0, 0]
7
+ dropout: 0.0
8
+ visual_encoder: # input (64, 32, 32)
9
+ in_channels: 64
10
+ block_out_channels: [64, 128, 256, 256, 512, 1024]
11
+ downsample_factors: [2, 2, 2, 1, 2, 2]
12
+ attn_blocks: [0, 0, 0, 0, 0, 0]
13
+ dropout: 0.0
14
+
15
+ ckpt:
16
+ resume_ckpt_path: ""
17
+ inference_ckpt_path: ""
18
+ save_ckpt_steps: 2500
19
+
20
+ data:
21
+ train_output_dir: output/syncnet
22
+ num_val_samples: 1200
23
+ batch_size: 120 # 40
24
+ num_workers: 11 # 11
25
+ latent_space: true
26
+ num_frames: 16
27
+ resolution: 256
28
+ train_fileslist: ""
29
+ train_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/train
30
+ val_fileslist: ""
31
+ val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
32
+ audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
33
+ lower_half: false
34
+ pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
35
+ audio_sample_rate: 16000
36
+ video_fps: 25
37
+
38
+ optimizer:
39
+ lr: 1e-5
40
+ max_grad_norm: 1.0
41
+
42
+ run:
43
+ max_train_steps: 10000000
44
+ validation_steps: 2500
45
+ mixed_precision_training: true
46
+ seed: 42
configs/syncnet/syncnet_16_pixel.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ audio_encoder: # input (1, 80, 52)
3
+ in_channels: 1
4
+ block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
5
+ downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
6
+ attn_blocks: [0, 0, 0, 0, 0, 0, 0]
7
+ dropout: 0.0
8
+ visual_encoder: # input (48, 128, 256)
9
+ in_channels: 48
10
+ block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
11
+ downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
12
+ attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
13
+ dropout: 0.0
14
+
15
+ ckpt:
16
+ resume_ckpt_path: ""
17
+ inference_ckpt_path: checkpoints/latentsync_syncnet.pt
18
+ save_ckpt_steps: 2500
19
+
20
+ data:
21
+ train_output_dir: debug/syncnet
22
+ num_val_samples: 2048
23
+ batch_size: 128 # 128
24
+ num_workers: 11 # 11
25
+ latent_space: false
26
+ num_frames: 16
27
+ resolution: 256
28
+ train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
29
+ train_data_dir: ""
30
+ val_fileslist: ""
31
+ val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
32
+ audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
33
+ lower_half: true
34
+ audio_sample_rate: 16000
35
+ video_fps: 25
36
+
37
+ optimizer:
38
+ lr: 1e-5
39
+ max_grad_norm: 1.0
40
+
41
+ run:
42
+ max_train_steps: 10000000
43
+ validation_steps: 2500
44
+ mixed_precision_training: true
45
+ seed: 42
configs/syncnet/syncnet_25_pixel.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ audio_encoder: # input (1, 80, 80)
3
+ in_channels: 1
4
+ block_out_channels: [64, 128, 256, 256, 512, 1024]
5
+ downsample_factors: [2, 2, 2, 2, 2, 2]
6
+ dropout: 0.0
7
+ visual_encoder: # input (75, 128, 256)
8
+ in_channels: 75
9
+ block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
10
+ downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
11
+ dropout: 0.0
12
+
13
+ ckpt:
14
+ resume_ckpt_path: ""
15
+ inference_ckpt_path: ""
16
+ save_ckpt_steps: 2500
17
+
18
+ data:
19
+ train_output_dir: debug/syncnet
20
+ num_val_samples: 2048
21
+ batch_size: 64 # 64
22
+ num_workers: 11 # 11
23
+ latent_space: false
24
+ num_frames: 25
25
+ resolution: 256
26
+ train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
27
+ # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
28
+ train_data_dir: ""
29
+ val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
30
+ # /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
31
+ val_data_dir: ""
32
+ audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
33
+ lower_half: true
34
+ pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
35
+ audio_sample_rate: 16000
36
+ video_fps: 25
37
+
38
+ optimizer:
39
+ lr: 1e-5
40
+ max_grad_norm: 1.0
41
+
42
+ run:
43
+ max_train_steps: 10000000
44
+ mixed_precision_training: true
45
+ seed: 42
configs/unet/first_stage.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
3
+ train_output_dir: debug/unet
4
+ train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
5
+ train_data_dir: ""
6
+ audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
7
+ audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
8
+
9
+ val_video_path: assets/demo1_video.mp4
10
+ val_audio_path: assets/demo1_audio.wav
11
+ batch_size: 8 # 8
12
+ num_workers: 11 # 11
13
+ num_frames: 16
14
+ resolution: 256
15
+ mask: fix_mask
16
+ audio_sample_rate: 16000
17
+ video_fps: 25
18
+
19
+ ckpt:
20
+ resume_ckpt_path: checkpoints/latentsync_unet.pt
21
+ save_ckpt_steps: 5000
22
+
23
+ run:
24
+ pixel_space_supervise: false
25
+ use_syncnet: false
26
+ sync_loss_weight: 0.05 # 1/283
27
+ perceptual_loss_weight: 0.1 # 0.1
28
+ recon_loss_weight: 1 # 1
29
+ guidance_scale: 1.0 # 1.5 or 1.0
30
+ trepa_loss_weight: 10
31
+ inference_steps: 20
32
+ seed: 1247
33
+ use_mixed_noise: true
34
+ mixed_noise_alpha: 1 # 1
35
+ mixed_precision_training: true
36
+ enable_gradient_checkpointing: false
37
+ enable_xformers_memory_efficient_attention: true
38
+ max_train_steps: 10000000
39
+ max_train_epochs: -1
40
+
41
+ optimizer:
42
+ lr: 1e-5
43
+ scale_lr: false
44
+ max_grad_norm: 1.0
45
+ lr_scheduler: constant
46
+ lr_warmup_steps: 0
47
+
48
+ model:
49
+ act_fn: silu
50
+ add_audio_layer: true
51
+ custom_audio_layer: false
52
+ audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
53
+ attention_head_dim: 8
54
+ block_out_channels: [320, 640, 1280, 1280]
55
+ center_input_sample: false
56
+ cross_attention_dim: 384
57
+ down_block_types:
58
+ [
59
+ "CrossAttnDownBlock3D",
60
+ "CrossAttnDownBlock3D",
61
+ "CrossAttnDownBlock3D",
62
+ "DownBlock3D",
63
+ ]
64
+ mid_block_type: UNetMidBlock3DCrossAttn
65
+ up_block_types:
66
+ [
67
+ "UpBlock3D",
68
+ "CrossAttnUpBlock3D",
69
+ "CrossAttnUpBlock3D",
70
+ "CrossAttnUpBlock3D",
71
+ ]
72
+ downsample_padding: 1
73
+ flip_sin_to_cos: true
74
+ freq_shift: 0
75
+ in_channels: 13 # 49
76
+ layers_per_block: 2
77
+ mid_block_scale_factor: 1
78
+ norm_eps: 1e-5
79
+ norm_num_groups: 32
80
+ out_channels: 4 # 16
81
+ sample_size: 64
82
+ resnet_time_scale_shift: default # Choose between [default, scale_shift]
83
+ unet_use_cross_frame_attention: false
84
+ unet_use_temporal_attention: false
85
+
86
+ # Actually we don't use the motion module in the final version of LatentSync
87
+ # When we started the project, we used the codebase of AnimateDiff and tried the motion module, but the results were poor
88
+ # We decided to leave the code here for possible future usage
89
+ use_motion_module: false
90
+ motion_module_resolutions: [1, 2, 4, 8]
91
+ motion_module_mid_block: false
92
+ motion_module_decoder_only: false
93
+ motion_module_type: Vanilla
94
+ motion_module_kwargs:
95
+ num_attention_heads: 8
96
+ num_transformer_block: 1
97
+ attention_block_types:
98
+ - Temporal_Self
99
+ - Temporal_Self
100
+ temporal_position_encoding: true
101
+ temporal_position_encoding_max_len: 16
102
+ temporal_attention_dim_div: 1
103
+ zero_initialize: true
configs/unet/second_stage.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
3
+ train_output_dir: debug/unet
4
+ train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
5
+ train_data_dir: ""
6
+ audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
7
+ audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
8
+
9
+ val_video_path: assets/demo1_video.mp4
10
+ val_audio_path: assets/demo1_audio.wav
11
+ batch_size: 2 # 8
12
+ num_workers: 11 # 11
13
+ num_frames: 16
14
+ resolution: 256
15
+ mask: fix_mask
16
+ audio_sample_rate: 16000
17
+ video_fps: 25
18
+
19
+ ckpt:
20
+ resume_ckpt_path: checkpoints/latentsync_unet.pt
21
+ save_ckpt_steps: 5000
22
+
23
+ run:
24
+ pixel_space_supervise: true
25
+ use_syncnet: true
26
+ sync_loss_weight: 0.05 # 1/283
27
+ perceptual_loss_weight: 0.1 # 0.1
28
+ recon_loss_weight: 1 # 1
29
+ guidance_scale: 1.0 # 1.5 or 1.0
30
+ trepa_loss_weight: 10
31
+ inference_steps: 20
32
+ seed: 1247
33
+ use_mixed_noise: true
34
+ mixed_noise_alpha: 1 # 1
35
+ mixed_precision_training: true
36
+ enable_gradient_checkpointing: false
37
+ enable_xformers_memory_efficient_attention: true
38
+ max_train_steps: 10000000
39
+ max_train_epochs: -1
40
+
41
+ optimizer:
42
+ lr: 1e-5
43
+ scale_lr: false
44
+ max_grad_norm: 1.0
45
+ lr_scheduler: constant
46
+ lr_warmup_steps: 0
47
+
48
+ model:
49
+ act_fn: silu
50
+ add_audio_layer: true
51
+ custom_audio_layer: false
52
+ audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
53
+ attention_head_dim: 8
54
+ block_out_channels: [320, 640, 1280, 1280]
55
+ center_input_sample: false
56
+ cross_attention_dim: 384
57
+ down_block_types:
58
+ [
59
+ "CrossAttnDownBlock3D",
60
+ "CrossAttnDownBlock3D",
61
+ "CrossAttnDownBlock3D",
62
+ "DownBlock3D",
63
+ ]
64
+ mid_block_type: UNetMidBlock3DCrossAttn
65
+ up_block_types:
66
+ [
67
+ "UpBlock3D",
68
+ "CrossAttnUpBlock3D",
69
+ "CrossAttnUpBlock3D",
70
+ "CrossAttnUpBlock3D",
71
+ ]
72
+ downsample_padding: 1
73
+ flip_sin_to_cos: true
74
+ freq_shift: 0
75
+ in_channels: 13 # 49
76
+ layers_per_block: 2
77
+ mid_block_scale_factor: 1
78
+ norm_eps: 1e-5
79
+ norm_num_groups: 32
80
+ out_channels: 4 # 16
81
+ sample_size: 64
82
+ resnet_time_scale_shift: default # Choose between [default, scale_shift]
83
+ unet_use_cross_frame_attention: false
84
+ unet_use_temporal_attention: false
85
+
86
+ # Actually we don't use the motion module in the final version of LatentSync
87
+ # When we started the project, we used the codebase of AnimateDiff and tried the motion module, but the results were poor
88
+ # We decided to leave the code here for possible future usage
89
+ use_motion_module: false
90
+ motion_module_resolutions: [1, 2, 4, 8]
91
+ motion_module_mid_block: false
92
+ motion_module_decoder_only: false
93
+ motion_module_type: Vanilla
94
+ motion_module_kwargs:
95
+ num_attention_heads: 8
96
+ num_transformer_block: 1
97
+ attention_block_types:
98
+ - Temporal_Self
99
+ - Temporal_Self
100
+ temporal_position_encoding: true
101
+ temporal_position_encoding_max_len: 16
102
+ temporal_attention_dim_div: 1
103
+ zero_initialize: true
eval/detectors/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Face detector
2
+
3
+ This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
eval/detectors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .s3fd import S3FD
eval/detectors/s3fd/__init__.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ from torchvision import transforms
6
+ from .nets import S3FDNet
7
+ from .box_utils import nms_
8
+
9
# Relative path of the weight file that S3FD.__init__ passes to torch.load.
PATH_WEIGHT = 'checkpoints/auxiliary/sfd_face.pth'
# Per-channel mean subtracted from each image in S3FD.detect_faces; shaped (3, 1, 1)
# so that it broadcasts over a channel-first (3, H, W) float32 image.
img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
11
+
12
+
13
class S3FD():
    """Face detector that wraps an S3FDNet loaded from ``PATH_WEIGHT``."""

    def __init__(self, device='cuda'):
        """Build the network on ``device``, load its weights and switch to eval mode.

        Args:
            device: Device passed to ``S3FDNet``, ``Module.to`` and as
                ``map_location`` for ``torch.load``.
        """
        tstamp = time.time()
        self.device = device

        print('[S3FD] loading with', self.device)
        self.net = S3FDNet(device=self.device).to(self.device)
        state_dict = torch.load(PATH_WEIGHT, map_location=self.device)
        self.net.load_state_dict(state_dict)
        self.net.eval()
        print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))

    def detect_faces(self, image, conf_th=0.8, scales=(1,)):
        """Detect faces in ``image`` at one or more scales.

        Args:
            image: Array indexed as (height, width, channel) with 3 channels.
            conf_th: Only detections whose score is strictly greater than this
                value are collected.
            scales: Iterable of resize factors; the network is run once per
                factor and all detections are pooled before the final NMS.

        Returns:
            A numpy array with one row per kept detection,
            ``(x1, y1, x2, y2, score)``, where the box coordinates have been
            multiplied by the width/height of the *unscaled* input image.
        """
        w, h = image.shape[1], image.shape[0]

        bboxes = np.empty(shape=(0, 5))

        with torch.no_grad():
            for s in scales:
                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)

                # (H, W, C) -> (H, C, W) -> (C, H, W)
                scaled_img = np.swapaxes(scaled_img, 1, 2)
                scaled_img = np.swapaxes(scaled_img, 1, 0)
                # Channels are reversed, mean-subtracted, then reversed back, so
                # img_mean is effectively applied in reversed channel order.
                scaled_img = scaled_img[[2, 1, 0], :, :]
                scaled_img = scaled_img.astype('float32')
                scaled_img -= img_mean
                scaled_img = scaled_img[[2, 1, 0], :, :]
                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
                y = self.net(x)

                detections = y.data
                scale = torch.Tensor([w, h, w, h])
                rows_per_class = detections.size(2)

                for i in range(detections.size(1)):
                    j = 0
                    # Bound j explicitly: without it the scan indexes past the
                    # last row when every row of a class passes the threshold.
                    while j < rows_per_class and detections[0, i, j, 0] > conf_th:
                        score = detections[0, i, j, 0]
                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
                        bboxes = np.vstack((bboxes, bbox))
                        j += 1

        # Merge overlapping boxes gathered across classes and scales.
        keep = nms_(bboxes, 0.1)
        bboxes = bboxes[keep]

        return bboxes
eval/detectors/s3fd/box_utils.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from itertools import product as product
3
+ import torch
4
+ from torch.autograd import Function
5
+ import warnings
6
+
7
+
8
def nms_(dets, thresh):
    """Greedy non-maximum suppression over a numpy detection array.

    Courtesy of Ross Girshick
    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]

    Args:
        dets: 2-D array whose first five columns are ``x1, y1, x2, y2, score``.
        thresh: A lower-scoring box survives only if its IoU with the box
            just kept is ``<= thresh``.

    Returns:
        int32 array of kept row indices, highest score first.
    """
    left, top, right, bottom, conf = (dets[:, col] for col in range(5))
    box_area = (right - left) * (bottom - top)
    ranking = conf.argsort()[::-1]

    kept = []
    while ranking.size > 0:
        best, rest = ranking[0], ranking[1:]
        kept.append(int(best))

        # Intersection rectangle of the best box with every remaining box.
        ix1 = np.maximum(left[best], left[rest])
        iy1 = np.maximum(top[best], top[rest])
        ix2 = np.minimum(right[best], right[rest])
        iy2 = np.minimum(bottom[best], bottom[rest])
        inter = np.maximum(0.0, ix2 - ix1) * np.maximum(0.0, iy2 - iy1)

        iou = inter / (box_area[best] + box_area[rest] - inter)
        ranking = rest[np.where(iou <= thresh)[0]]

    return np.array(kept).astype(np.int32)
40
+
41
+
42
def decode(loc, priors, variances):
    """Turn regression offsets back into corner-form boxes.

    Args:
        loc (tensor): location predictions, Shape: [num_priors,4].
        priors (tensor): prior boxes in center-offset form
            ``(cx, cy, w, h)``, Shape: [num_priors,4].
        variances: (list[float]) two scale factors; the first is applied to
            the center offsets, the second to the size offsets.
    Return:
        Tensor of shape [num_priors,4] holding ``(x1, y1, x2, y2)``.
    """
    prior_center, prior_size = priors[:, :2], priors[:, 2:]

    center = prior_center + loc[:, :2] * variances[0] * prior_size
    size = prior_size * torch.exp(loc[:, 2:] * variances[1])

    # center/size -> top-left / bottom-right corners
    top_left = center - size / 2
    bottom_right = size + top_left
    return torch.cat((top_left, bottom_right), 1)
61
+
62
+
63
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4],
            as ``(x1, y1, x2, y2)``.
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) A remaining box is dropped when its IoU with the box
            just kept is greater than this value.
        top_k: (int) The maximum number of box preds to consider.
    Return:
        ``(keep, count)``: ``keep`` is a long tensor of length ``num_priors``
        whose first ``count`` entries are the indices of the kept boxes in
        descending score order; the remaining entries are zero padding.
    """
    keep = scores.new_zeros(scores.size(0), dtype=torch.long)
    if boxes.numel() == 0:
        return keep, 0

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    _, idx = scores.sort(0)  # sort in ascending order
    idx = idx[-top_k:]  # indices of the top-k largest vals

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view

        # Gather the remaining boxes into fresh tensors (rather than resizing
        # preallocated `out=` buffers, which makes torch emit a UserWarning)
        # and clip them to the kept box to get the intersection rectangle.
        xx1 = torch.clamp(torch.index_select(x1, 0, idx), min=x1[i])
        yy1 = torch.clamp(torch.index_select(y1, 0, idx), min=y1[i])
        xx2 = torch.clamp(torch.index_select(x2, 0, idx), max=x2[i])
        yy2 = torch.clamp(torch.index_select(y2, 0, idx), max=y2[i])

        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h

        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas
        union = (rem_areas - inter) + area[i]
        IoU = inter / union

        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
131
+
132
+
133
class Detect(object):
    """Decode raw head outputs into per-class, NMS-filtered detections."""

    def __init__(self, num_classes=2,
                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
                 variance=[0.1, 0.2], nms_top_k=5000):
        """Store the post-processing settings.

        Args:
            num_classes: Number of score columns per prior; class 0 is never
                emitted by ``forward``.
            top_k: Maximum number of detections written per image and class.
            nms_thresh: Overlap threshold handed to ``nms``.
            conf_thresh: Scores must be strictly greater than this to be kept.
            variance: Variances handed to ``decode``.
            nms_top_k: ``top_k`` handed to ``nms``.
        """
        self.num_classes = num_classes
        self.top_k = top_k
        self.nms_thresh = nms_thresh
        self.conf_thresh = conf_thresh
        self.variance = variance
        self.nms_top_k = nms_top_k

    def forward(self, loc_data, conf_data, prior_data):
        """Run decode + NMS for every image in the batch.

        Args:
            loc_data: Location predictions; viewed as ``(-1, 4)``.
            conf_data: Class scores; viewed as
                ``(batch, num_priors, num_classes)``.
            prior_data: Prior boxes; viewed as ``(-1, num_priors, 4)``.

        Returns:
            Tensor of shape ``(batch, num_classes, top_k, 5)`` created with
            ``torch.zeros`` defaults. Each filled row is
            ``(score, x1, y1, x2, y2)``; unfilled rows stay zero.
        """
        num = loc_data.size(0)
        num_priors = prior_data.size(0)

        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
        # Repeat the priors for every image so they line up with the flattened loc_data.
        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
        batch_priors = batch_priors.contiguous().view(-1, 4)

        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
        decoded_boxes = decoded_boxes.view(num, num_priors, 4)

        output = torch.zeros(num, self.num_classes, self.top_k, 5)

        for i in range(num):
            boxes = decoded_boxes[i].clone()
            conf_scores = conf_preds[i].clone()

            # Class 0 is skipped.
            for cl in range(1, self.num_classes):
                c_mask = conf_scores[cl].gt(self.conf_thresh)
                scores = conf_scores[cl][c_mask]

                # A boolean-masked tensor is always 1-D, so emptiness has to be
                # tested with numel(); dim() == 0 can never be true here.
                if scores.numel() == 0:
                    continue
                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                boxes_ = boxes[l_mask].view(-1, 4)
                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
                count = count if count < self.top_k else self.top_k

                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)

        return output
178
+
179
+
180
class PriorBox(object):
    """Generate one prior box per feature-map cell in center-size form."""

    def __init__(self, input_size, feature_maps,
                 variance=[0.1, 0.2],
                 min_sizes=[16, 32, 64, 128, 256, 512],
                 steps=[4, 8, 16, 32, 64, 128],
                 clip=False):
        """Remember the image size, feature-map sizes and per-level settings.

        Args:
            input_size: ``(height, width)`` of the network input.
            feature_maps: Sequence of ``(height, width)`` pairs, one per level.
            variance: Stored on the instance; not used by ``forward``.
            min_sizes: Box side length per level, in input pixels.
            steps: Cell stride per level, in input pixels.
            clip: When true, ``forward`` clamps every value to ``[0, 1]``.
        """
        super(PriorBox, self).__init__()

        self.imh, self.imw = input_size[0], input_size[1]
        self.feature_maps = feature_maps

        self.variance = variance
        self.min_sizes = min_sizes
        self.steps = steps
        self.clip = clip

    def forward(self):
        """Return a float tensor of shape ``(num_priors, 4)``.

        Rows are ``(cx, cy, w, h)`` expressed as fractions of the input
        width/height, ordered level by level and row-major within a level.
        """
        flat = []
        for level, fmap in enumerate(self.feature_maps):
            rows, cols = fmap[0], fmap[1]
            for row, col in product(range(rows), range(cols)):
                # Number of cells that fit across the input at this level.
                cells_x = self.imw / self.steps[level]
                cells_y = self.imh / self.steps[level]

                flat.extend((
                    (col + 0.5) / cells_x,
                    (row + 0.5) / cells_y,
                    self.min_sizes[level] / self.imw,
                    self.min_sizes[level] / self.imh,
                ))

        priors = torch.FloatTensor(flat).view(-1, 4)

        if self.clip:
            priors.clamp_(max=1, min=0)

        return priors
eval/detectors/s3fd/nets.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.init as init
5
+ from .box_utils import Detect, PriorBox
6
+
7
+
8
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation followed by a learnable per-channel gain."""

    def __init__(self, n_channels, scale):
        """Create the gain parameter and fill it with ``scale``.

        Args:
            n_channels: Length of the ``weight`` parameter.
            scale: Initial value for every entry of ``weight``.
        """
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        self.gamma = scale or None
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Set every entry of ``weight`` to ``gamma``."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        """Divide ``x`` by its L2 norm over dim 1 (plus eps) and scale by ``weight``."""
        magnitude = torch.sqrt(torch.sum(x.pow(2), dim=1, keepdim=True)) + self.eps
        unit = torch.div(x, magnitude)
        # Lift the 1-D weight to (1, C, 1, 1) so it lines up with dim 1 of the input.
        gain = self.weight.view(1, -1, 1, 1).expand_as(unit)
        return gain * unit
26
+
27
+
28
class S3FDNet(nn.Module):
    """Convolutional backbone plus multibox heads, post-processed by ``Detect``.

    The order of the layers inside ``vgg``, ``extras``, ``loc`` and ``conf``
    determines the state-dict keys and must not be changed.
    """

    def __init__(self, device='cuda'):
        """Build all layers.

        Args:
            device: Stored as ``self.device``. Layers are created on the
                default device; the caller moves the module.
        """
        super(S3FDNet, self).__init__()
        self.device = device

        self.vgg = nn.ModuleList([
            nn.Conv2d(3, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2, ceil_mode=True),

            nn.Conv2d(256, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, 1, 1),
            nn.ReLU(inplace=True),
        ])

        self.L2Norm3_3 = L2Norm(256, 10)
        self.L2Norm4_3 = L2Norm(512, 8)
        self.L2Norm5_3 = L2Norm(512, 5)

        self.extras = nn.ModuleList([
            nn.Conv2d(1024, 256, 1, 1),
            nn.Conv2d(256, 512, 3, 2, padding=1),
            nn.Conv2d(512, 128, 1, 1),
            nn.Conv2d(128, 256, 3, 2, padding=1),
        ])

        self.loc = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(1024, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(256, 4, 3, 1, padding=1),
        ])

        # The first confidence head emits 4 channels; forward() reduces them to 2.
        self.conf = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(1024, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(256, 2, 3, 1, padding=1),
        ])

        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect()

    def forward(self, x):
        """Run the network and return the output of ``Detect.forward``.

        Args:
            x: Input batch; dims 2 and 3 are taken as the spatial size.

        Returns:
            Whatever ``self.detect.forward`` returns for the flattened
            location predictions, softmaxed 2-class scores and priors.
        """
        size = x.size()[2:]
        in_device = x.device
        sources = list()
        loc = list()
        conf = list()

        # Three intermediate feature maps are L2-normalised before use as sources.
        for k in range(16):
            x = self.vgg[k](x)
        s = self.L2Norm3_3(x)
        sources.append(s)

        for k in range(16, 23):
            x = self.vgg[k](x)
        s = self.L2Norm4_3(x)
        sources.append(s)

        for k in range(23, 30):
            x = self.vgg[k](x)
        s = self.L2Norm5_3(x)
        sources.append(s)

        for k in range(30, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        loc_x = self.loc[0](sources[0])
        conf_x = self.conf[0](sources[0])

        # Collapse the first three of the four channels to their maximum so the
        # first head also ends up with two score channels.
        max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
        conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)

        loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
        conf.append(conf_x.permute(0, 2, 3, 1).contiguous())

        for i in range(1, len(sources)):
            x = sources[i]
            conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
            loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())

        # After the permute, dims 1 and 2 are the height/width of each source map.
        features_maps = []
        for i in range(len(loc)):
            feat = []
            feat += [loc[i].size(1), loc[i].size(2)]
            features_maps += [feat]

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        with torch.no_grad():
            # The feature-map sizes follow from the input size alone, so the
            # priors only need rebuilding when the input size changes.
            priors_key = tuple(size)
            if getattr(self, '_priors_key', None) != priors_key:
                self.priorbox = PriorBox(size, features_maps)
                self.priors = self.priorbox.forward()
                self._priors_key = priors_key

        output = self.detect.forward(
            loc.view(loc.size(0), -1, 4),
            self.softmax(conf.view(conf.size(0), -1, 2)),
            # Follow the device the input actually lives on rather than the
            # constructor-time self.device, which may be out of date.
            self.priors.to(in_device)
        )

        return output
eval/draw_syncnet_lines.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import matplotlib.pyplot as plt
17
+
18
+
19
class Chart:
    """Collect loss histories from checkpoints and plot them on one figure."""

    def __init__(self):
        # One tuple per added checkpoint:
        # (line_name, train_steps, train_losses, val_steps, val_losses)
        self.loss_list = []

    @staticmethod
    def _thin(values):
        """Keep the first entry plus every fifth entry starting at index 4."""
        return [values[0]] + values[4::5]

    def add_ckpt(self, ckpt_path, line_name):
        """Load the loss histories stored in a checkpoint and register them.

        Args:
            ckpt_path: Path handed to ``torch.load``.
            line_name: Legend label for this checkpoint's curve.
        """
        state = torch.load(ckpt_path, map_location="cpu")
        train_steps, train_losses = state["train_step_list"], state["train_loss_list"]
        val_steps, val_losses = state["val_step_list"], state["val_loss_list"]
        entry = (line_name, train_steps, train_losses, self._thin(val_steps), self._thin(val_losses))
        self.loss_list.append(entry)

    def draw(self, save_path, plot_val=True):
        """Render all registered curves and save the figure to ``save_path``.

        Args:
            save_path: Destination handed to ``plt.savefig``.
            plot_val: When true, draw a faint training curve plus a thicker
                validation curve in the same colour; otherwise draw only the
                training curve.
        """
        # Global settings
        plt.rcParams.update({
            "font.size": 14,
            "font.family": "serif",
            "font.sans-serif": ["Arial", "DejaVu Sans", "Lucida Grande"],
            "font.serif": ["Times New Roman", "DejaVu Serif"],
        })

        # Creating the plot
        plt.figure(figsize=(7.766, 4.8))  # Golden ratio
        for name, train_x, train_y, val_x, val_y in self.loss_list:
            if not plot_val:
                plt.plot(train_x, train_y, label=name, linewidth=1)
                continue
            (train_line,) = plt.plot(train_x, train_y, label=name, linewidth=0.5, alpha=0.5)
            # Only the training curve carries a label, so each run appears once in the legend.
            plt.plot(val_x, val_y, linewidth=1.5, color=train_line.get_color())
        plt.xlabel("Step")
        plt.ylabel("Loss")
        legend = plt.legend()

        # Adjust the linewidth of legend
        for handle in legend.get_lines():
            handle.set_linewidth(2)

        plt.savefig(save_path, transparent=True)
        plt.close()
60
+
61
+
62
if __name__ == "__main__":
    # Script entry point: overlay the loss curves of several SyncNet training
    # checkpoints, one labelled line per run, and save the chart as a PDF.
    chart = Chart()
    # chart.add_ckpt("output/syncnet/train-2024_10_25-18:14:43/checkpoints/checkpoint-10000.pt", "w/ self-attn")
    # chart.add_ckpt("output/syncnet/train-2024_10_25-18:21:59/checkpoints/checkpoint-10000.pt", "w/o self-attn")
    chart.add_ckpt("output/syncnet/train-2024_10_24-21:03:11/checkpoints/checkpoint-10000.pt", "Dim 512")
    chart.add_ckpt("output/syncnet/train-2024_10_25-18:21:59/checkpoints/checkpoint-10000.pt", "Dim 2048")
    chart.add_ckpt("output/syncnet/train-2024_10_24-22:37:04/checkpoints/checkpoint-10000.pt", "Dim 4096")
    chart.add_ckpt("output/syncnet/train-2024_10_25-02:30:17/checkpoints/checkpoint-10000.pt", "Dim 6144")
    chart.draw("ablation.pdf", plot_val=True)
eval/eval_fvd.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import mediapipe as mp
16
+ import cv2
17
+ from decord import VideoReader
18
+ from einops import rearrange
19
+ import os
20
+ import numpy as np
21
+ import torch
22
+ import tqdm
23
+ from eval.fvd import compute_our_fvd
24
+
25
+
26
class FVD:
    """Extract fixed-length, face-cropped clips from videos for FVD evaluation.

    Args:
        resolution: ``(height, width)`` every face crop is resized to.
    """

    # Frame window sampled from each video: frames 20..35, i.e. 16 consecutive frames.
    _CLIP_START = 20
    _CLIP_LENGTH = 16

    def __init__(self, resolution=(224, 224)):
        self.face_detector = mp.solutions.face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)
        self.resolution = resolution

    def detect_face(self, image):
        """Crop ``image`` to the first detected face.

        Args:
            image: Array whose first two axes are (height, width).

        Returns:
            The sub-array of ``image`` covered by the face bounding box,
            clipped to the image borders.

        Raises:
            Exception: If the detector reports no face.
        """
        height, width = image.shape[:2]
        # Process the image and detect faces.
        results = self.face_detector.process(image)

        if not results.detections:  # Face not detected
            raise Exception("Face not detected")

        detection = results.detections[0]  # Only use the first face in the image
        bounding_box = detection.location_data.relative_bounding_box
        xmin = int(bounding_box.xmin * width)
        ymin = int(bounding_box.ymin * height)
        face_width = int(bounding_box.width * width)
        face_height = int(bounding_box.height * height)

        # Compute the far edges from the *unclamped* origin. Clamping the
        # origin first would shift a partially off-image box inwards instead
        # of clipping it, pulling non-face pixels into the crop.
        xmax = min(width, xmin + face_width)
        ymax = min(height, ymin + face_height)
        xmin = max(0, xmin)
        ymin = max(0, ymin)

        return image[ymin:ymax, xmin:xmax]

    def detect_video(self, video_path, real: bool = True):
        """Return the face-cropped 16-frame clip of a video.

        Args:
            video_path: Path of the video to read.
            real: Unused; kept for interface compatibility.

        Returns:
            A tensor of shape ``(16, h, w, c)`` built from the resized crops,
            or ``None`` when the video does not provide all 16 frames.

        Raises:
            Exception: Propagated from ``detect_face`` when a frame has no face.
        """
        vr = VideoReader(video_path)
        clip_end = self._CLIP_START + self._CLIP_LENGTH
        video_frames = vr[self._CLIP_START : clip_end].asnumpy()  # 16 consecutive frames
        vr.seek(0)  # avoid memory leak

        # Reject short clips before spending time on face detection.
        if len(video_frames) != self._CLIP_LENGTH:
            return None

        faces = []
        for frame in video_frames:
            face = self.detect_face(frame)
            # cv2.resize takes (width, height); self.resolution is (height, width).
            face = cv2.resize(face, (self.resolution[1], self.resolution[0]), interpolation=cv2.INTER_AREA)
            faces.append(face)

        faces = np.stack(faces, axis=0)  # (f, h, w, c)
        return torch.from_numpy(faces)
70
+
71
+
72
def eval_fvd(real_videos_dir, fake_videos_dir):
    """Compute and print the FVD between generated videos and their sources.

    Every ``*.mp4`` in ``fake_videos_dir`` is paired with the file of the same
    name in ``real_videos_dir``, with a ``_out.mp4`` suffix mapped back to
    ``.mp4``. Pairs for which either clip cannot be extracted are skipped.

    Args:
        real_videos_dir: Directory containing the reference videos.
        fake_videos_dir: Directory containing the generated videos.

    Raises:
        ValueError: If no usable real/fake pair was found.
    """
    fvd = FVD()
    real_features_list = []
    fake_features_list = []
    # Sorted so the evaluation order does not depend on the filesystem.
    for file in tqdm.tqdm(sorted(os.listdir(fake_videos_dir))):
        if not file.endswith(".mp4"):
            continue
        real_video_path = os.path.join(real_videos_dir, file.replace("_out.mp4", ".mp4"))
        fake_video_path = os.path.join(fake_videos_dir, file)
        real_features = fvd.detect_video(real_video_path, real=True)
        fake_features = fvd.detect_video(fake_video_path, real=False)
        if real_features is None or fake_features is None:
            continue
        real_features_list.append(real_features)
        fake_features_list.append(fake_features)

    if not real_features_list:
        # torch.stack on an empty list fails with an unhelpful message.
        raise ValueError(f"No usable video pairs found in {fake_videos_dir!r} / {real_videos_dir!r}")

    # Scale the stacked clips by 1/255 before feature extraction.
    real_features = torch.stack(real_features_list) / 255.0
    fake_features = torch.stack(fake_features_list) / 255.0
    # Pass by keyword: the callee's positional order is (fake, real).
    print(compute_our_fvd(videos_fake=fake_features, videos_real=real_features, device="cpu"))
90
+
91
+
92
if __name__ == "__main__":
    # Script entry point. Both directories are hard-coded, machine-specific
    # paths; change them to the reference and generated video folders.
    real_videos_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/segmented/cross"
    fake_videos_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/segmented/latentsync_cross"

    eval_fvd(real_videos_dir, fake_videos_dir)
eval/eval_sync_conf.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import os
17
+ import tqdm
18
+ from statistics import fmean
19
+ from eval.syncnet import SyncNetEval
20
+ from eval.syncnet_detect import SyncNetDetector
21
+ from latentsync.utils.util import red_text
22
+ import torch
23
+
24
+
25
def syncnet_eval(syncnet, syncnet_detector, video_path, temp_dir, detect_results_dir="detect_results"):
    """Score one video with SyncNet and return its AV offset and confidence.

    The detector is run on ``video_path`` first. Every file it leaves in
    ``<detect_results_dir>/crop`` is then evaluated, and the per-crop results
    are averaged.

    Args:
        syncnet: Evaluator exposing ``evaluate(video_path=..., temp_dir=...)``,
            which returns a 3-tuple whose first and last items are the AV
            offset and the confidence.
        syncnet_detector: Callable invoked as
            ``syncnet_detector(video_path=..., min_track=50)``.
        video_path: Video to evaluate.
        temp_dir: Scratch directory forwarded to ``syncnet.evaluate``.
        detect_results_dir: Directory whose ``crop`` sub-folder is read.

    Returns:
        ``(av_offset, conf)``: the mean offset truncated to ``int`` and the
        mean confidence.

    Raises:
        Exception: If the crop directory is empty.
    """
    syncnet_detector(video_path=video_path, min_track=50)

    crop_dir = os.path.join(detect_results_dir, "crop")
    crop_videos = os.listdir(crop_dir)
    if len(crop_videos) == 0:
        raise Exception(red_text(f"Face not detected in {video_path}"))

    per_crop = [
        syncnet.evaluate(video_path=os.path.join(crop_dir, crop_name), temp_dir=temp_dir)
        for crop_name in crop_videos
    ]
    # Each result is (av_offset, <unused>, conf).
    offsets, _, confidences = zip(*per_crop)

    av_offset = int(fmean(offsets))
    conf = fmean(confidences)
    print(f"Input video: {video_path}\nSyncNet confidence: {conf:.2f}\nAV offset: {av_offset}")
    return av_offset, conf
42
+
43
+
44
def main():
    """Command-line entry point for SyncNet confidence evaluation.

    With ``--video_path`` a single video is scored. Otherwise every ``*.mp4``
    in ``--videos_dir`` is scored, failures are printed and skipped, and the
    mean confidence over the successful videos is reported.
    """
    parser = argparse.ArgumentParser(description="SyncNet")
    parser.add_argument("--initial_model", type=str, default="checkpoints/auxiliary/syncnet_v2.model", help="")
    parser.add_argument("--video_path", type=str, default=None, help="")
    parser.add_argument("--videos_dir", type=str, default="/root/processed")
    parser.add_argument("--temp_dir", type=str, default="temp", help="")

    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    syncnet = SyncNetEval(device=device)
    syncnet.loadParameters(args.initial_model)

    syncnet_detector = SyncNetDetector(device=device, detect_results_dir="detect_results")

    if args.video_path is not None:
        syncnet_eval(syncnet, syncnet_detector, args.video_path, args.temp_dir)
        return

    sync_conf_list = []
    video_names = sorted([f for f in os.listdir(args.videos_dir) if f.endswith(".mp4")])
    for video_name in tqdm.tqdm(video_names):
        try:
            _, conf = syncnet_eval(
                syncnet, syncnet_detector, os.path.join(args.videos_dir, video_name), args.temp_dir
            )
            sync_conf_list.append(conf)
        except Exception as e:
            # Best effort: one bad video must not abort the whole batch.
            print(e)

    if sync_conf_list:
        print(f"The average sync confidence is {fmean(sync_conf_list):.02f}")
    else:
        # fmean([]) raises StatisticsError; report the situation instead.
        print(f"No video in {args.videos_dir} could be evaluated; average sync confidence is undefined")


if __name__ == "__main__":
    main()
eval/eval_sync_conf.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #!/bin/bash
2
+ python -m eval.eval_sync_conf --video_path "RD_Radio1_000_006_out.mp4"
eval/eval_syncnet_acc.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ from tqdm.auto import tqdm
17
+ import torch
18
+ import torch.nn as nn
19
+ from einops import rearrange
20
+ from latentsync.models.syncnet import SyncNet
21
+ from latentsync.data.syncnet_dataset import SyncNetDataset
22
+ from diffusers import AutoencoderKL
23
+ from omegaconf import OmegaConf
24
+ from accelerate.utils import set_seed
25
+
26
+
27
def main(config):
    """Measure the classification accuracy of a trained SyncNet checkpoint.

    The validation dataloader is iterated, restarting when exhausted, until
    ``config.data.num_val_samples // config.data.batch_size`` batches have
    been scored. A pair is predicted as in-sync when the cosine similarity of
    its vision and audio embeddings exceeds 0.5. The accuracy is printed.

    Args:
        config: OmegaConf configuration providing the ``run``, ``data``,
            ``model`` and ``ckpt`` sections read below.

    Raises:
        RuntimeError: If the dataloader yields no batch at all, which would
            otherwise make the outer loop spin forever.
    """
    set_seed(config.run.seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    if config.data.latent_space:
        vae = AutoencoderKL.from_pretrained(
            "runwayml/stable-diffusion-inpainting", subfolder="vae", revision="fp16", torch_dtype=torch.float16
        )
        vae.requires_grad_(False)
        vae.to(device)

    # Dataset and Dataloader setup
    dataset = SyncNetDataset(config.data.val_data_dir, config.data.val_fileslist, config)

    test_dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.data.batch_size,
        shuffle=False,
        num_workers=config.data.num_workers,
        drop_last=False,
        worker_init_fn=dataset.worker_init_fn,
    )

    # Model
    syncnet = SyncNet(OmegaConf.to_container(config.model)).to(device)

    print(f"Load checkpoint from: {config.ckpt.inference_ckpt_path}")
    checkpoint = torch.load(config.ckpt.inference_ckpt_path, map_location=device)

    syncnet.load_state_dict(checkpoint["state_dict"])
    syncnet.to(dtype=torch.float16)
    syncnet.requires_grad_(False)
    syncnet.eval()

    global_step = 0
    num_val_batches = config.data.num_val_samples // config.data.batch_size
    progress_bar = tqdm(range(0, num_val_batches), initial=0, desc="Testing accuracy")

    num_correct_preds = 0
    num_total_preds = 0

    while True:
        batches_this_pass = 0
        for batch in test_dataloader:
            batches_this_pass += 1
            ### >>>> Test >>>> ###

            frames = batch["frames"].to(device, dtype=torch.float16)
            audio_samples = batch["audio_samples"].to(device, dtype=torch.float16)
            y = batch["y"].to(device, dtype=torch.float16).squeeze(1)

            if config.data.latent_space:
                frames = rearrange(frames, "b f c h w -> (b f) c h w")

                with torch.no_grad():
                    # The encoder output is multiplied by 0.18215 before use.
                    frames = vae.encode(frames).latent_dist.sample() * 0.18215

                frames = rearrange(frames, "(b f) c h w -> b (f c) h w", f=config.data.num_frames)
            else:
                frames = rearrange(frames, "b f c h w -> b (f c) h w")

            if config.data.lower_half:
                # Keep only the bottom half of each frame.
                height = frames.shape[2]
                frames = frames[:, :, height // 2 :, :]

            with torch.no_grad():
                vision_embeds, audio_embeds = syncnet(frames, audio_samples)

            sims = nn.functional.cosine_similarity(vision_embeds, audio_embeds)

            preds = (sims > 0.5).to(dtype=torch.float16)
            num_correct_preds += (preds == y).sum().item()
            num_total_preds += len(sims)

            progress_bar.update(1)
            global_step += 1

            if global_step >= num_val_batches:
                progress_bar.close()
                print(f"Accuracy score: {num_correct_preds / num_total_preds*100:.2f}%")
                return

        if batches_this_pass == 0:
            # An empty dataloader can never reach num_val_batches.
            progress_bar.close()
            raise RuntimeError("Validation dataloader produced no batches; check val_data_dir / val_fileslist")
107
+
108
+
109
if __name__ == "__main__":
    # CLI entry point: read the config path from the command line, load the
    # YAML with OmegaConf, and run the accuracy evaluation.
    parser = argparse.ArgumentParser(description="Code to test the accuracy of expert lip-sync discriminator")

    parser.add_argument("--config_path", type=str, default="configs/syncnet/syncnet_16_latent.yaml")
    args = parser.parse_args()

    # Load a configuration file
    config = OmegaConf.load(args.config_path)

    main(config)
eval/eval_syncnet_acc.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ python -m eval.eval_syncnet_acc --config_path "configs/syncnet/syncnet_16_pixel.yaml"
eval/fvd.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/universome/fvd-comparison/blob/master/our_fvd.py
2
+
3
+ from typing import Tuple
4
+ import scipy
5
+ import numpy as np
6
+ import torch
7
+
8
+
9
def compute_fvd(feats_fake: np.ndarray, feats_real: np.ndarray) -> float:
    """Return the Fréchet distance between two sets of feature vectors.

    Each set is summarised by its per-dimension mean and its covariance
    (rows are samples). The result is
    ``|mu_f - mu_r|^2 + Tr(S_f + S_r - 2 * sqrt(S_f @ S_r))``.

    Args:
        feats_fake: Features of the generated samples, one row per sample.
        feats_real: Features of the reference samples, one row per sample.

    Returns:
        The real part of the distance as a Python float.
    """
    mean_fake, cov_fake = feats_fake.mean(axis=0), np.cov(feats_fake, rowvar=False)
    mean_real, cov_real = feats_real.mean(axis=0), np.cov(feats_real, rowvar=False)

    mean_term = np.square(mean_fake - mean_real).sum()
    # With disp=False, sqrtm returns (sqrt, error_estimate); the estimate is ignored.
    cov_sqrt, _ = scipy.linalg.sqrtm(np.dot(cov_fake, cov_real), disp=False)  # pylint: disable=no-member
    trace_term = np.trace(cov_fake + cov_real - cov_sqrt * 2)

    # sqrtm may return a complex matrix; only the real part is kept.
    return float(np.real(mean_term + trace_term))
18
+
19
+
20
def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return the mean vector and covariance matrix of ``feats``.

    Rows of ``feats`` are treated as samples and columns as variables, so the
    mean has shape ``[d]`` and the covariance ``[d, d]``.
    """
    return feats.mean(axis=0), np.cov(feats, rowvar=False)
25
+
26
+
27
@torch.no_grad()
def compute_our_fvd(videos_fake: torch.Tensor, videos_real: torch.Tensor, device: str = "cuda") -> float:
    """Compute the FVD between two batches of videos with a TorchScript I3D model.

    Args:
        videos_fake: Generated videos laid out as ``(b, f, h, w, c)``.
        videos_real: Reference videos with the same layout.
        device: Device on which the I3D model and the videos are placed.

    Returns:
        The Fréchet distance between the I3D features of both batches.

    Note:
        Each batch goes through the network in a single forward pass, so
        memory use grows with the number of videos.
    """
    i3d_path = "checkpoints/auxiliary/i3d_torchscript.pt"
    i3d_kwargs = dict(
        rescale=False, resize=False, return_features=True
    )  # Return raw features before the softmax layer.

    with open(i3d_path, "rb") as f:
        # map_location restores the weights directly onto the target device,
        # so a model saved from a GPU still loads on a CPU-only machine.
        i3d_model = torch.jit.load(f, map_location=device).eval().to(device)

    # (b, f, h, w, c) -> (b, c, f, h, w)
    videos_fake = videos_fake.permute(0, 4, 1, 2, 3).to(device)
    videos_real = videos_real.permute(0, 4, 1, 2, 3).to(device)

    feats_fake = i3d_model(videos_fake, **i3d_kwargs).cpu().numpy()
    feats_real = i3d_model(videos_real, **i3d_kwargs).cpu().numpy()

    return compute_fvd(feats_fake, feats_real)
44
+
45
+
46
def main():
    """Smoke-test the FVD pipeline on random videos and print the score."""
    # Fall back to the CPU when no GPU is present instead of relying on the
    # callee's "cuda" default.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # input shape: (b, f, h, w, c)
    videos_fake = torch.rand(10, 16, 224, 224, 3)
    videos_real = torch.rand(10, 16, 224, 224, 3)

    our_fvd_result = compute_our_fvd(videos_fake, videos_real, device=device)
    print(f"[FVD scores] Ours: {our_fvd_result}")


if __name__ == "__main__":
    main()
eval/hyper_iqa.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/SSL92/hyperIQA/blob/master/models.py
2
+
3
+ import torch as torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from torch.nn import init
7
+ import math
8
+ import torch.utils.model_zoo as model_zoo
9
+
10
# Download URLs of ImageNet-pretrained ResNet weights, keyed by architecture
# name. Within this file only the 'resnet50' entry is looked up.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
17
+
18
+
19
class HyperNet(nn.Module):
    """
    Hyper network that generates the weights and biases of the target network.

    Args:
        lda_out_channels: local distortion aware module output size.
        hyper_in_channels: input feature channels for hyper network.
        target_in_size: input vector size for target network.
        target_fc(i)_size: fully connection layer size of target network.
        feature_size: input feature map width/height for hyper network.

    Note:
        For size match, input args must satisfy: 'target_fc(i)_size * target_fc(i+1)_size' is divisible by 'feature_size ^ 2'.

    """

    def __init__(self, lda_out_channels, hyper_in_channels, target_in_size, target_fc1_size, target_fc2_size, target_fc3_size, target_fc4_size, feature_size):
        super().__init__()

        self.hyperInChn = hyper_in_channels
        self.target_in_size = target_in_size
        self.f1 = target_fc1_size
        self.f2 = target_fc2_size
        self.f3 = target_fc3_size
        self.f4 = target_fc4_size
        self.feature_size = feature_size

        # NOTE: the three sub-modules below must stay registered first and in
        # this order; the initialisation loop at the end skips exactly them.
        self.res = resnet50_backbone(lda_out_channels, target_in_size, pretrained=True)

        self.pool = nn.AdaptiveAvgPool2d((1, 1))

        # 1x1 convolutions reducing the 2048-channel backbone output to hyperInChn channels
        self.conv1 = nn.Sequential(
            nn.Conv2d(2048, 1024, 1, padding=(0, 0)),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 512, 1, padding=(0, 0)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, self.hyperInChn, 1, padding=(0, 0)),
            nn.ReLU(inplace=True)
        )

        # Hyper network heads: a conv produces each target FC weight tensor,
        # a linear layer each bias. A weight matrix is spread over the
        # feature_size x feature_size map, hence the division of the channels.
        spatial = feature_size ** 2

        self.fc1w_conv = nn.Conv2d(self.hyperInChn, int(self.target_in_size * self.f1 / spatial), 3, padding=(1, 1))
        self.fc1b_fc = nn.Linear(self.hyperInChn, self.f1)

        self.fc2w_conv = nn.Conv2d(self.hyperInChn, int(self.f1 * self.f2 / spatial), 3, padding=(1, 1))
        self.fc2b_fc = nn.Linear(self.hyperInChn, self.f2)

        self.fc3w_conv = nn.Conv2d(self.hyperInChn, int(self.f2 * self.f3 / spatial), 3, padding=(1, 1))
        self.fc3b_fc = nn.Linear(self.hyperInChn, self.f3)

        self.fc4w_conv = nn.Conv2d(self.hyperInChn, int(self.f3 * self.f4 / spatial), 3, padding=(1, 1))
        self.fc4b_fc = nn.Linear(self.hyperInChn, self.f4)

        self.fc5w_fc = nn.Linear(self.hyperInChn, self.f4)
        self.fc5b_fc = nn.Linear(self.hyperInChn, 1)

        # Kaiming-initialise every head, i.e. everything registered after
        # res (0), pool (1) and conv1 (2).
        for position, head in enumerate(self._modules.values()):
            if position > 2:
                nn.init.kaiming_normal_(head.weight.data)

    def forward(self, img):
        """Return the target-net input vector and all generated weights/biases.

        The result is a dict with the keys ``target_in_vec`` and
        ``target_fc{1..5}w`` / ``target_fc{1..5}b``.
        """
        side = self.feature_size

        backbone_out = self.res(img)

        # input vector for target net
        target_in_vec = backbone_out['target_in_vec'].reshape(-1, self.target_in_size, 1, 1)

        # input features for hyper net
        hyper_in_feat = self.conv1(backbone_out['hyper_in_feat']).reshape(-1, self.hyperInChn, side, side)

        # Globally pooled features shared by every bias head and by the last weight head.
        pooled = self.pool(hyper_in_feat).squeeze()

        return {
            'target_in_vec': target_in_vec,
            'target_fc1w': self.fc1w_conv(hyper_in_feat).reshape(-1, self.f1, self.target_in_size, 1, 1),
            'target_fc1b': self.fc1b_fc(pooled).reshape(-1, self.f1),
            'target_fc2w': self.fc2w_conv(hyper_in_feat).reshape(-1, self.f2, self.f1, 1, 1),
            'target_fc2b': self.fc2b_fc(pooled).reshape(-1, self.f2),
            'target_fc3w': self.fc3w_conv(hyper_in_feat).reshape(-1, self.f3, self.f2, 1, 1),
            'target_fc3b': self.fc3b_fc(pooled).reshape(-1, self.f3),
            'target_fc4w': self.fc4w_conv(hyper_in_feat).reshape(-1, self.f4, self.f3, 1, 1),
            'target_fc4b': self.fc4b_fc(pooled).reshape(-1, self.f4),
            'target_fc5w': self.fc5w_fc(pooled).reshape(-1, 1, self.f4, 1, 1),
            'target_fc5b': self.fc5b_fc(pooled).reshape(-1, 1),
        }
121
+
122
+
123
class TargetNet(nn.Module):
    """
    Target network for quality prediction.

    It is a stack of five ``TargetFC`` layers whose weights and biases are
    taken from ``paras`` (keys ``target_fc{i}w`` / ``target_fc{i}b``), with a
    sigmoid after each of the first four.
    """

    def __init__(self, paras):
        super().__init__()

        def generated_fc(index):
            # Layer `index` built from the generated weight and bias.
            return TargetFC(paras[f'target_fc{index}w'], paras[f'target_fc{index}b'])

        self.l1 = nn.Sequential(generated_fc(1), nn.Sigmoid())
        self.l2 = nn.Sequential(generated_fc(2), nn.Sigmoid())
        self.l3 = nn.Sequential(generated_fc(3), nn.Sigmoid())
        # The last stage also contains the final layer, which has no activation.
        self.l4 = nn.Sequential(generated_fc(4), nn.Sigmoid(), generated_fc(5))

    def forward(self, x):
        """Run ``x`` through all stages and squeeze singleton dimensions."""
        hidden = self.l3(self.l2(self.l1(x)))
        return self.l4(hidden).squeeze()
156
+
157
+
158
class TargetFC(nn.Module):
    """
    Fully connected layer with a separate weight matrix per batch element.

    Note:
        Weights & biases differ for every image in the batch, so the batch is
        folded into the channel axis and a single grouped convolution (one
        group per image) applies each image's own parameters.
    """

    def __init__(self, weight, bias):
        super().__init__()
        self.weight = weight
        self.bias = bias

    def forward(self, input_):
        in_shape = input_.shape
        w_shape = self.weight.shape

        # (B, C_in, H, W) -> (1, B * C_in, H, W)
        folded_input = input_.reshape(-1, in_shape[0] * in_shape[1], in_shape[2], in_shape[3])
        # (B, C_out, C_in, kH, kW) -> (B * C_out, C_in, kH, kW)
        folded_weight = self.weight.reshape(w_shape[0] * w_shape[1], w_shape[2], w_shape[3], w_shape[4])
        folded_bias = self.bias.reshape(self.bias.shape[0] * self.bias.shape[1])

        result = F.conv2d(input=folded_input, weight=folded_weight, bias=folded_bias, groups=w_shape[0])

        # Unfold the batch axis again: (B, C_out, H, W)
        return result.reshape(in_shape[0], w_shape[1], in_shape[2], in_shape[3])
179
+
180
+
181
class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions plus a shortcut.

    The block outputs ``planes * 4`` channels. ``downsample``, when given, is
    applied to the input to form the shortcut branch.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # The stride is applied by the 3x3 convolution.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
218
+
219
+
220
class ResNetBackbone(nn.Module):
    """ResNet feature extractor with local-distortion-aware (LDA) side branches.

    ``forward`` returns a dict with the final feature map (``hyper_in_feat``)
    and the concatenation of four per-stage LDA vectors (``target_in_vec``).

    Args:
        lda_out_channels: Length of each of the first three LDA vectors.
        in_chn: Total length of ``target_in_vec``; the fourth LDA vector
            fills the remaining ``in_chn - 3 * lda_out_channels`` entries.
        block: Residual block class (must expose ``expansion``).
        layers: Number of blocks in each of the four stages.
        num_classes: Unused.
    """

    def __init__(self, lda_out_channels, in_chn, block, layers, num_classes=1000):
        super().__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # local distortion aware module
        # NOTE(review): the linear sizes (16 * 64, 32 * 16, 64 * 4, 2048) look
        # tied to a fixed input resolution, presumably 224 x 224 - confirm.
        self.lda1_pool = nn.Sequential(
            nn.Conv2d(256, 16, kernel_size=1, stride=1, padding=0, bias=False),
            nn.AvgPool2d(7, stride=7),
        )
        self.lda1_fc = nn.Linear(16 * 64, lda_out_channels)

        self.lda2_pool = nn.Sequential(
            nn.Conv2d(512, 32, kernel_size=1, stride=1, padding=0, bias=False),
            nn.AvgPool2d(7, stride=7),
        )
        self.lda2_fc = nn.Linear(32 * 16, lda_out_channels)

        self.lda3_pool = nn.Sequential(
            nn.Conv2d(1024, 64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.AvgPool2d(7, stride=7),
        )
        self.lda3_fc = nn.Linear(64 * 4, lda_out_channels)

        self.lda4_pool = nn.AvgPool2d(7, stride=7)
        self.lda4_fc = nn.Linear(2048, in_chn - lda_out_channels * 3)

        # Default initialisation for every convolution and batch norm.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

        # The LDA layers are then re-initialised with Kaiming-normal weights.
        lda_layers = (
            self.lda1_pool[0],
            self.lda2_pool[0],
            self.lda3_pool[0],
            self.lda1_fc,
            self.lda2_fc,
            self.lda3_fc,
            self.lda4_fc,
        )
        for lda_layer in lda_layers:
            nn.init.kaiming_normal_(lda_layer.weight.data)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``."""
        out_channels = planes * block.expansion

        # A projection shortcut is needed whenever the shape changes.
        downsample = None
        if stride != 1 or self.inplanes != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_channels
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        batch = x.size(0)

        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        # the same effect as lda operation in the paper, but save much more memory
        x = self.layer1(x)
        lda_1 = self.lda1_fc(self.lda1_pool(x).reshape(batch, -1))
        x = self.layer2(x)
        lda_2 = self.lda2_fc(self.lda2_pool(x).reshape(batch, -1))
        x = self.layer3(x)
        lda_3 = self.lda3_fc(self.lda3_pool(x).reshape(batch, -1))
        x = self.layer4(x)
        lda_4 = self.lda4_fc(self.lda4_pool(x).reshape(batch, -1))

        return {
            'hyper_in_feat': x,
            'target_in_vec': torch.cat((lda_1, lda_2, lda_3, lda_4), 1),
        }
313
+
314
+
315
def resnet50_backbone(lda_out_channels, in_chn, pretrained=False, **kwargs):
    """Construct the ResNet-50 based backbone.

    Args:
        lda_out_channels: Forwarded to ``ResNetBackbone``.
        in_chn: Forwarded to ``ResNetBackbone``.
        pretrained (bool): If True, copy the downloaded ResNet-50 weights
            into every parameter whose name exists in the model; otherwise
            initialise all modules with ``weights_init_xavier``.
    """
    model = ResNetBackbone(lda_out_channels, in_chn, Bottleneck, [3, 4, 6, 3], **kwargs)

    if not pretrained:
        model.apply(weights_init_xavier)
        return model

    downloaded = model_zoo.load_url(model_urls['resnet50'])
    merged = model.state_dict()
    # Keep only entries the model actually has; the remaining layers keep
    # their freshly initialised values.
    merged.update({name: tensor for name, tensor in downloaded.items() if name in merged})
    model.load_state_dict(merged)
    return model
331
+
332
+
333
def weights_init_xavier(m):
    """Initialise one module in place; intended for ``nn.Module.apply``.

    Despite the name, convolution and linear weights receive Kaiming-normal
    initialisation. ``BatchNorm2d`` weights are drawn from N(1.0, 0.02) and
    their biases are set to zero. Modules of any other class are untouched.

    Args:
        m: The module to initialise, matched by its class name.
    """
    classname = m.__class__.__name__
    if 'Conv' in classname or 'Linear' in classname:
        init.kaiming_normal_(m.weight.data)
    elif 'BatchNorm2d' in classname:
        # The previous call was init.uniform_(w, 1.0, 0.02): a range whose
        # lower bound exceeds its upper bound, which raises. A scale centred
        # on 1 with a small spread is what the arguments describe.
        init.normal_(m.weight.data, 1.0, 0.02)
        init.constant_(m.bias.data, 0.0)
eval/inference_videos.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import subprocess
17
+ from tqdm import tqdm
18
+
19
+
20
def inference_video_from_dir(input_dir, output_dir, unet_config_path, ckpt_path):
    """Run ``inference.py`` once for every ``*.mp4`` in ``input_dir``.

    For a video ``<name>.mp4`` the audio is read from ``<name>_audio.wav`` in
    the same directory and the result is written to ``<name>_out.mp4`` in
    ``output_dir``. A failing run is reported and the loop continues.

    Args:
        input_dir: Directory holding the source videos and their audio files.
        output_dir: Directory for the generated videos; created if missing.
        unet_config_path: Value passed as ``--unet_config_path``.
        ckpt_path: Value passed as ``--inference_ckpt_path``.
    """
    os.makedirs(output_dir, exist_ok=True)
    video_names = sorted(f for f in os.listdir(input_dir) if f.endswith(".mp4"))
    for video_name in tqdm(video_names):
        # Strip only the trailing extension; str.replace would also rewrite
        # any ".mp4" occurring earlier in the file name.
        stem = video_name[: -len(".mp4")]
        video_path = os.path.join(input_dir, video_name)
        audio_path = os.path.join(input_dir, f"{stem}_audio.wav")
        video_out_path = os.path.join(output_dir, f"{stem}_out.mp4")

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        inference_command = [
            "python",
            "inference.py",
            "--unet_config_path",
            str(unet_config_path),
            "--video_path",
            video_path,
            "--audio_path",
            audio_path,
            "--video_out_path",
            video_out_path,
            "--inference_ckpt_path",
            str(ckpt_path),
            "--seed",
            "1247",
        ]
        result = subprocess.run(inference_command, shell=False)
        if result.returncode != 0:
            print(f"Inference failed for {video_path} (exit code {result.returncode})")
29
+
30
+
31
if __name__ == "__main__":
    # Script entry point. All four values are hard-coded for the author's
    # environment; adjust them before running. The relative paths assume the
    # working directory also contains inference.py.
    input_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/segmented/cross"
    output_dir = "/mnt/bn/maliva-gen-ai-v2/chunyu.li/HDTF/segmented/latentsync_cross"
    unet_config_path = "configs/unet/unet_latent_16_diffusion.yaml"
    ckpt_path = "output/unet/train-2024_10_08-16:23:43/checkpoints/checkpoint-1920000.pt"

    inference_video_from_dir(input_dir, output_dir, unet_config_path, ckpt_path)
eval/syncnet/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .syncnet_eval import SyncNetEval
eval/syncnet/syncnet.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/joonson/syncnet_python/blob/master/SyncNetModel.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
def save(model, filename):
    """Serialise ``model`` to ``filename`` with ``torch.save`` and report it."""
    with open(filename, "wb") as stream:
        torch.save(model, stream)
        print("%s saved." % filename)
11
+
12
+
13
def load(filename):
    """Return the object stored in ``filename`` by ``torch.save``.

    NOTE: ``torch.load`` is pickle-based, so only open files you trust.
    """
    return torch.load(filename)
16
+
17
+
18
class S(nn.Module):
    """Two-stream SyncNet model with an audio branch and a lip (video) branch.

    Each branch is a convolutional stack ending in 512 channels, followed by
    a fully connected head that maps the flattened 512 features to
    ``num_layers_in_fc_layers`` outputs.
    """

    def __init__(self, num_layers_in_fc_layers=1024):
        super().__init__()

        self.__nFeatures__ = 24
        self.__nChs__ = 32
        self.__midChs__ = 32

        def conv2d_unit(in_ch, out_ch, relu_inplace=True, **conv_kwargs):
            # Conv2d -> BatchNorm2d -> ReLU
            return [nn.Conv2d(in_ch, out_ch, **conv_kwargs), nn.BatchNorm2d(out_ch), nn.ReLU(inplace=relu_inplace)]

        def conv3d_unit(in_ch, out_ch, **conv_kwargs):
            # Conv3d -> BatchNorm3d -> ReLU
            return [nn.Conv3d(in_ch, out_ch, **conv_kwargs), nn.BatchNorm3d(out_ch), nn.ReLU(inplace=True)]

        def fc_head():
            # Linear -> BatchNorm1d -> ReLU -> Linear
            return nn.Sequential(
                nn.Linear(512, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Linear(512, num_layers_in_fc_layers),
            )

        # Audio convolution stack
        self.netcnnaud = nn.Sequential(
            *conv2d_unit(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.MaxPool2d(kernel_size=(1, 1), stride=(1, 1)),
            *conv2d_unit(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(1, 2)),
            *conv2d_unit(192, 384, kernel_size=(3, 3), padding=(1, 1)),
            *conv2d_unit(384, 256, kernel_size=(3, 3), padding=(1, 1)),
            *conv2d_unit(256, 256, kernel_size=(3, 3), padding=(1, 1)),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2)),
            *conv2d_unit(256, 512, relu_inplace=False, kernel_size=(5, 4), padding=(0, 0)),
        )

        self.netfcaud = fc_head()

        self.netfclip = fc_head()

        # Lip (video) convolution stack
        self.netcnnlip = nn.Sequential(
            *conv3d_unit(3, 96, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=0),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
            *conv3d_unit(96, 256, kernel_size=(1, 5, 5), stride=(1, 2, 2), padding=(0, 1, 1)),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
            *conv3d_unit(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            *conv3d_unit(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            *conv3d_unit(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
            *conv3d_unit(256, 512, kernel_size=(1, 6, 6), padding=0),
        )

    def forward_aud(self, x):
        """Embed an audio input: conv stack, flatten per sample, FC head."""
        conv_out = self.netcnnaud(x)
        flat = conv_out.view(conv_out.size(0), -1)
        return self.netfcaud(flat)

    def forward_lip(self, x):
        """Embed a video input: conv stack, flatten per sample, FC head."""
        conv_out = self.netcnnlip(x)
        flat = conv_out.view(conv_out.size(0), -1)
        return self.netfclip(flat)

    def forward_lipfeat(self, x):
        """Return the flattened lip conv features without the FC head."""
        conv_out = self.netcnnlip(x)
        return conv_out.view(conv_out.size(0), -1)
eval/syncnet/syncnet_eval.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/joonson/syncnet_python/blob/master/SyncNetInstance.py
2
+
3
+ import torch
4
+ import numpy
5
+ import time, pdb, argparse, subprocess, os, math, glob
6
+ import cv2
7
+ import python_speech_features
8
+
9
+ from scipy import signal
10
+ from scipy.io import wavfile
11
+ from .syncnet import S
12
+ from shutil import rmtree
13
+
14
+
15
+ # ==================== Get OFFSET ====================
16
+
17
+ # Video 25 FPS, Audio 16000HZ
18
+
19
+
20
def calc_pdist(feat1, feat2, vshift=10):
    """Compute windowed pairwise distances between two feature sequences.

    ``feat2`` is zero-padded with ``vshift`` rows at both ends. Row ``i`` of
    ``feat1`` is then compared against the ``2 * vshift + 1`` consecutive
    padded rows that start at index ``i``.

    Returns:
        A list with one 1-D distance tensor per row of ``feat1``.
    """
    window = 2 * vshift + 1
    # Pad only the row dimension (the last pair in the tuple), not the features.
    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))

    return [
        torch.nn.functional.pairwise_distance(
            feat1[[row], :].repeat(window, 1),
            padded[row : row + window, :],
        )
        for row in range(len(feat1))
    ]
34
+
35
+
36
+ # ==================== MAIN DEF ====================
37
+
38
+
39
class SyncNetEval(torch.nn.Module):
    """Inference wrapper around the SyncNet model ``S``.

    Provides audio-visual offset / confidence estimation for a video file
    (``evaluate``), lip feature extraction (``extract_feature``) and
    checkpoint loading (``loadParameters``).
    """

    def __init__(self, dropout=0, num_layers_in_fc_layers=1024, device="cpu"):
        """Build the underlying ``S`` network and move it to ``device``.

        ``dropout`` is accepted but not used by this constructor.
        """
        super().__init__()

        self.__S__ = S(num_layers_in_fc_layers=num_layers_in_fc_layers).to(device)
        self.device = device

    def evaluate(self, video_path, temp_dir="temp", batch_size=20, vshift=15):
        """Estimate the audio-visual offset and sync confidence of a video.

        Frames and a mono 16 kHz WAV track are extracted with ffmpeg into
        ``temp_dir`` (which is wiped first and always removed afterwards),
        embedded with the lip and audio branches, and compared over a window
        of ``2 * vshift + 1`` shifts.

        Args:
            video_path: Path of the video to evaluate.
            temp_dir: Scratch directory; any existing content is deleted.
            batch_size: Number of 5-frame windows embedded per forward pass.
            vshift: Maximum shift (in feature steps) searched on each side.

        Returns:
            Tuple ``(av_offset, min_dist, conf)`` of Python scalars, where
            ``av_offset`` is ``vshift`` minus the index of the smallest mean
            distance, ``min_dist`` is that smallest mean distance and
            ``conf`` is the median mean distance minus ``min_dist``.
        """
        self.__S__.eval()

        # Start from an empty scratch directory.
        if os.path.exists(temp_dir):
            rmtree(temp_dir)
        os.makedirs(temp_dir)

        try:
            # ========== Convert files ==========
            # Argument lists (no shell) keep paths with spaces or shell
            # metacharacters from being split or interpreted.
            audio_path = os.path.join(temp_dir, "audio.wav")
            ffmpeg_in = ["ffmpeg", "-loglevel", "error", "-nostdin", "-y", "-i", str(video_path)]
            subprocess.call(ffmpeg_in + ["-f", "image2", os.path.join(temp_dir, "%06d.jpg")])
            subprocess.call(
                ffmpeg_in
                + ["-async", "1", "-ac", "1", "-vn", "-acodec", "pcm_s16le", "-ar", "16000", audio_path]
            )

            # ========== Load video ==========
            images = []
            for fname in sorted(glob.glob(os.path.join(temp_dir, "*.jpg"))):
                # NOTE: the 224x224 input size is hard-coded.
                images.append(cv2.resize(cv2.imread(fname), (224, 224)))

            # (H, W, C, T) -> (1, H, W, C, T) -> (1, C, T, H, W)
            im = numpy.stack(images, axis=3)
            im = numpy.expand_dims(im, axis=0)
            im = numpy.transpose(im, (0, 3, 4, 1, 2))
            # Convert straight to float32; a float64 intermediate would double peak memory.
            imtv = torch.from_numpy(im.astype(numpy.float32))

            # ========== Load audio ==========
            sample_rate, audio = wavfile.read(audio_path)
            # MFCC matrix transposed so that time is the last axis.
            mfcc = numpy.transpose(python_speech_features.mfcc(audio, sample_rate))
            cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0)
            cct = torch.from_numpy(cc.astype(numpy.float32))

            # ========== Check audio and video input length ==========
            # 640 audio samples per video frame (16000 Hz audio at 25 FPS video).
            min_length = min(len(images), math.floor(len(audio) / 640))

            # ========== Generate video and audio feats ==========
            lastframe = min_length - 5
            im_feat = []
            cc_feat = []

            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                for start in range(0, lastframe, batch_size):
                    frame_ids = range(start, min(lastframe, start + batch_size))

                    # Each sample is a window of 5 consecutive video frames.
                    im_in = torch.cat([imtv[:, :, vframe : vframe + 5, :, :] for vframe in frame_ids], 0)
                    im_feat.append(self.__S__.forward_lip(im_in.to(self.device)).cpu())

                    # The matching audio window: 20 MFCC steps starting at 4 * vframe.
                    cc_in = torch.cat([cct[:, :, :, vframe * 4 : vframe * 4 + 20] for vframe in frame_ids], 0)
                    cc_feat.append(self.__S__.forward_aud(cc_in.to(self.device)).cpu())

            im_feat = torch.cat(im_feat, 0)
            cc_feat = torch.cat(cc_feat, 0)

            # ========== Compute offset ==========
            dists = calc_pdist(im_feat, cc_feat, vshift=vshift)
            mean_dists = torch.mean(torch.stack(dists, 1), 1)

            min_dist, minidx = torch.min(mean_dists, 0)

            av_offset = vshift - minidx
            conf = torch.median(mean_dists) - min_dist

            return av_offset.item(), min_dist.item(), conf.item()
        finally:
            # Remove the scratch directory even when a step above fails.
            rmtree(temp_dir, ignore_errors=True)

    def extract_feature(self, opt, videofile):
        """Return lip features for every 5-frame window of ``videofile``.

        Args:
            opt: Object exposing ``batch_size`` (windows per forward pass).
            videofile: Path of the video, read with ``cv2.VideoCapture``.

        Returns:
            A CPU tensor with one row of ``forward_lipfeat`` output per window.
        """
        self.__S__.eval()

        # ========== Load video ==========
        cap = cv2.VideoCapture(videofile)
        images = []
        try:
            while True:
                ret, image = cap.read()
                if not ret:
                    break
                images.append(image)
        finally:
            # Always free the decoder handle.
            cap.release()

        # (H, W, C, T) -> (1, H, W, C, T) -> (1, C, T, H, W)
        im = numpy.stack(images, axis=3)
        im = numpy.expand_dims(im, axis=0)
        im = numpy.transpose(im, (0, 3, 4, 1, 2))
        imtv = torch.from_numpy(im.astype(numpy.float32))

        # ========== Generate video feats ==========
        lastframe = len(images) - 4
        im_feat = []

        tS = time.time()
        with torch.no_grad():
            for start in range(0, lastframe, opt.batch_size):
                frame_ids = range(start, min(lastframe, start + opt.batch_size))
                im_in = torch.cat([imtv[:, :, vframe : vframe + 5, :, :] for vframe in frame_ids], 0)
                im_feat.append(self.__S__.forward_lipfeat(im_in.to(self.device)).cpu())

        im_feat = torch.cat(im_feat, 0)

        print("Compute time %.3f sec." % (time.time() - tS))

        return im_feat

    def loadParameters(self, path):
        """Copy the tensors stored in the checkpoint at ``path`` into the model.

        Every entry of the loaded state dict is copied in place into the
        entry of the same name in the wrapped network's state dict; a name
        missing from the network raises ``KeyError``.
        """
        # Keep tensors on the CPU storage they were deserialized into.
        loaded_state = torch.load(path, map_location=lambda storage, loc: storage)

        self_state = self.__S__.state_dict()

        for name, param in loaded_state.items():
            self_state[name].copy_(param)
eval/syncnet_detect.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/joonson/syncnet_python/blob/master/run_pipeline.py
2
+
3
+ import os, pdb, subprocess, glob, cv2
4
+ import numpy as np
5
+ from shutil import rmtree
6
+ import torch
7
+
8
+ from scenedetect.video_manager import VideoManager
9
+ from scenedetect.scene_manager import SceneManager
10
+ from scenedetect.stats_manager import StatsManager
11
+ from scenedetect.detectors import ContentDetector
12
+
13
+ from scipy.interpolate import interp1d
14
+ from scipy.io import wavfile
15
+ from scipy import signal
16
+
17
+ from eval.detectors import S3FD
18
+
19
+
20
class SyncNetDetector:
    """Face detection, tracking and cropping pipeline that prepares SyncNet inputs.

    Calling an instance on a video writes, under ``detect_results_dir``:
    ``video/`` (25 FPS re-encode and extracted audio), ``frames/`` (JPEG
    frames) and ``crop/`` (one 224x224 face clip with audio per face track).
    """

    def __init__(self, device, detect_results_dir="detect_results"):
        """Create the S3FD face detector on ``device`` and remember the output root."""
        self.s3f_detector = S3FD(device=device)
        self.detect_results_dir = detect_results_dir

    def __call__(self, video_path: str, min_track=50, scale=False):
        """Run the full pipeline on ``video_path``.

        Args:
            video_path: Input video file.
            min_track: Minimum shot length and track length, in frames.
            scale: When true, first rescale the video to 224x224 with ffmpeg.
        """
        crop_dir = os.path.join(self.detect_results_dir, "crop")
        video_dir = os.path.join(self.detect_results_dir, "video")
        frames_dir = os.path.join(self.detect_results_dir, "frames")
        temp_dir = os.path.join(self.detect_results_dir, "temp")
        work_dirs = (crop_dir, video_dir, frames_dir, temp_dir)

        # ========== DELETE EXISTING DIRECTORIES ==========
        for directory in work_dirs:
            if os.path.exists(directory):
                rmtree(directory)

        # ========== MAKE NEW DIRECTORIES ==========
        for directory in work_dirs:
            os.makedirs(directory)

        # ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
        # Commands are argument lists (no shell) so that paths containing
        # spaces or shell metacharacters are passed through untouched.
        ffmpeg = ["ffmpeg", "-y", "-nostdin", "-loglevel", "error"]

        if scale:
            scaled_video_path = os.path.join(video_dir, "scaled.mp4")
            subprocess.run(ffmpeg + ["-i", str(video_path), "-vf", "scale=224:224", scaled_video_path])
            video_path = scaled_video_path

        converted_path = os.path.join(video_dir, "video.mp4")

        # Re-encode at 25 FPS.
        subprocess.run(
            ffmpeg + ["-i", str(video_path), "-qscale:v", "2", "-async", "1", "-r", "25", converted_path]
        )
        # Dump every frame as a numbered JPEG.
        subprocess.run(
            ffmpeg
            + ["-i", converted_path, "-qscale:v", "2", "-f", "image2", os.path.join(frames_dir, "%06d.jpg")]
        )
        # Extract mono 16 kHz PCM audio.
        subprocess.run(
            ffmpeg
            + [
                "-i",
                converted_path,
                "-ac",
                "1",
                "-vn",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                os.path.join(video_dir, "audio.wav"),
            ]
        )

        faces = self.detect_face(frames_dir)

        scene = self.scene_detect(video_dir)

        # Face tracking: only shots at least `min_track` frames long are considered.
        alltracks = []
        for shot in scene:
            first, last = shot[0].frame_num, shot[1].frame_num
            if last - first >= min_track:
                alltracks.extend(self.track_face(faces[first:last], min_track=min_track))

        # Face crop: one output clip per track, named by a 5-digit index.
        for ii, track in enumerate(alltracks):
            self.crop_video(track, os.path.join(crop_dir, "%05d" % ii), frames_dir, 25, temp_dir, video_dir)

        rmtree(temp_dir)

    def scene_detect(self, video_dir):
        """Detect shot boundaries in ``video_dir/video.mp4`` with PySceneDetect.

        Returns:
            A list of ``(start, end)`` timecode pairs. When no cut is found,
            a single pair spanning from the base timecode to the current
            timecode of the video manager is returned.
        """
        video_manager = VideoManager([os.path.join(video_dir, "video.mp4")])
        stats_manager = StatsManager()
        scene_manager = SceneManager(stats_manager)
        # Add ContentDetector algorithm (constructor takes detector options like threshold).
        scene_manager.add_detector(ContentDetector())
        base_timecode = video_manager.get_base_timecode()

        try:
            video_manager.set_downscale_factor()
            video_manager.start()
            scene_manager.detect_scenes(frame_source=video_manager)

            scene_list = scene_manager.get_scene_list(base_timecode)

            if scene_list == []:
                # The fallback timecodes must be read before the manager is released.
                scene_list = [(video_manager.get_base_timecode(), video_manager.get_current_timecode())]
        finally:
            # Free the underlying video capture handle(s).
            video_manager.release()

        return scene_list

    def track_face(self, scenefaces, num_failed_det=25, min_track=50, min_face_size=100):
        """Greedily link per-frame detections of one shot into face tracks.

        NOTE: detections are removed from the inner lists of ``scenefaces``
        as they are assigned to a track, so the caller's lists are consumed.
        The removal happens while the inner list is being iterated; that
        order-dependent behaviour is intentional here and kept as is.

        Args:
            scenefaces: List (one entry per frame) of lists of detection
                dicts with at least ``"frame"`` and ``"bbox"`` keys.
            num_failed_det: Maximum frame gap allowed inside a track.
            min_track: A track is kept only if it has more detections than this.
            min_face_size: A track is kept only if the larger of its mean
                box width and mean box height exceeds this value.

        Returns:
            A list of dicts ``{"frame": ndarray, "bbox": ndarray}`` where the
            boxes are linearly interpolated over every frame of the track.
        """
        iouThres = 0.5  # Minimum IOU between consecutive face detections
        tracks = []

        while True:
            track = []
            for framefaces in scenefaces:
                for face in framefaces:
                    if track == []:
                        # First remaining detection seeds a new track.
                        track.append(face)
                        framefaces.remove(face)
                    elif face["frame"] - track[-1]["frame"] <= num_failed_det:
                        iou = bounding_box_iou(face["bbox"], track[-1]["bbox"])
                        if iou > iouThres:
                            track.append(face)
                            framefaces.remove(face)
                            continue
                    else:
                        # Gap to the last tracked detection is too large.
                        break

            if track == []:
                # Every detection has been consumed.
                break
            elif len(track) > min_track:
                framenum = np.array([f["frame"] for f in track])
                bboxes = np.array([np.array(f["bbox"]) for f in track])

                # Fill frames without a detection by interpolating each box coordinate.
                frame_i = np.arange(framenum[0], framenum[-1] + 1)

                bboxes_i = []
                for ij in range(0, 4):
                    interpfn = interp1d(framenum, bboxes[:, ij])
                    bboxes_i.append(interpfn(frame_i))
                bboxes_i = np.stack(bboxes_i, axis=1)

                mean_width = np.mean(bboxes_i[:, 2] - bboxes_i[:, 0])
                mean_height = np.mean(bboxes_i[:, 3] - bboxes_i[:, 1])
                if max(mean_width, mean_height) > min_face_size:
                    tracks.append({"frame": frame_i, "bbox": bboxes_i})

        return tracks

    def detect_face(self, frames_dir, facedet_scale=0.25):
        """Run the S3FD detector on every JPEG frame in ``frames_dir``.

        Returns:
            A list with one entry per frame (in sorted filename order); each
            entry is a list of dicts with keys ``"frame"`` (frame index),
            ``"bbox"`` (all detector outputs but the last, as a list) and
            ``"conf"`` (the last detector output).
        """
        flist = sorted(glob.glob(os.path.join(frames_dir, "*.jpg")))

        dets = []
        for fidx, fname in enumerate(flist):
            image = cv2.imread(fname)

            # OpenCV loads BGR; the detector is given RGB.
            image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            bboxes = self.s3f_detector.detect_faces(image_np, conf_th=0.9, scales=[facedet_scale])

            dets.append(
                [{"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]} for bbox in bboxes]
            )

        return dets

    def crop_video(self, track, cropfile, frames_dir, frame_rate, temp_dir, video_dir, crop_scale=0.4):
        """Write a 224x224 face clip (with matching audio) for one track.

        The silent clip is first written to ``cropfile + "t.mp4"``, the audio
        segment of the track is cut from ``video_dir/audio.wav`` into
        ``temp_dir/audio.wav``, both are muxed into ``cropfile + ".mp4"`` and
        the silent clip is deleted.

        Args:
            track: Dict with ``"frame"`` indices and ``"bbox"`` boxes.
            cropfile: Output path without extension.
            frames_dir: Directory holding the extracted JPEG frames.
            frame_rate: Frame rate of the output clip and of the track indices.
            temp_dir: Scratch directory for the cut audio.
            video_dir: Directory holding ``audio.wav``.
            crop_scale: Extra margin around the face, relative to its size.

        Returns:
            ``{"track": track, "proc_track": dets}`` where ``dets`` holds the
            median-filtered half-size ``"s"`` and centres ``"x"`` / ``"y"``.
        """
        flist = sorted(glob.glob(os.path.join(frames_dir, "*.jpg")))

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        vOut = cv2.VideoWriter(cropfile + "t.mp4", fourcc, frame_rate, (224, 224))

        dets = {"x": [], "y": [], "s": []}

        for det in track["bbox"]:
            dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)  # half of the larger side
            dets["y"].append((det[1] + det[3]) / 2)  # crop center y
            dets["x"].append((det[0] + det[2]) / 2)  # crop center x

        # Smooth detections
        dets["s"] = signal.medfilt(dets["s"], kernel_size=13)
        dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
        dets["y"] = signal.medfilt(dets["y"], kernel_size=13)

        cs = crop_scale

        try:
            for fidx, frame_idx in enumerate(track["frame"]):
                bs = dets["s"][fidx]  # Detection box size
                bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount

                image = cv2.imread(flist[frame_idx])

                # Pad with a constant grey border so crops near the edge stay in bounds.
                padded = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), "constant", constant_values=(110, 110))
                my = dets["y"][fidx] + bsi  # BBox center Y
                mx = dets["x"][fidx] + bsi  # BBox center X

                face = padded[
                    int(my - bs) : int(my + bs * (1 + 2 * cs)),
                    int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
                ]

                vOut.write(cv2.resize(face, (224, 224)))
        finally:
            # Finalize the silent clip even if a frame fails to process.
            vOut.release()

        audiotmp = os.path.join(temp_dir, "audio.wav")
        audiostart = (track["frame"][0]) / frame_rate
        audioend = (track["frame"][-1] + 1) / frame_rate

        ffmpeg = ["ffmpeg", "-y", "-nostdin", "-loglevel", "error"]

        # ========== CROP AUDIO FILE ==========
        subprocess.run(
            ffmpeg
            + [
                "-i",
                os.path.join(video_dir, "audio.wav"),
                "-ss",
                "%.3f" % audiostart,
                "-to",
                "%.3f" % audioend,
                audiotmp,
            ]
        )

        # Reading the file back raises if ffmpeg did not produce a readable WAV.
        wavfile.read(audiotmp)

        # ========== COMBINE AUDIO AND VIDEO FILES ==========
        subprocess.run(
            ffmpeg + ["-i", cropfile + "t.mp4", "-i", audiotmp, "-c:v", "copy", "-c:a", "aac", cropfile + ".mp4"]
        )

        os.remove(cropfile + "t.mp4")

        return {"track": track, "proc_track": dets}
236
+
237
+
238
def bounding_box_iou(boxA, boxB):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Non-overlapping boxes give ``0``. When the union area is zero (both
    boxes are degenerate) ``0.0`` is returned instead of dividing by zero.
    """
    # Corners of the intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp at zero so disjoint boxes yield an empty intersection.
    interArea = max(0, xB - xA) * max(0, yB - yA)

    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    unionArea = float(boxAArea + boxBArea - interArea)
    if unionArea == 0:
        # Two zero-area boxes: define the overlap as zero.
        return 0.0

    return interArea / unionArea