zhangbo2008 committed on
Commit
6a20b7b
1 Parent(s): 665ec9c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +15 -1
  2. .gitignore +16 -0
  3. 0.jpg +0 -0
  4. 1.py +305 -0
  5. 2.py +165 -0
  6. 3.py +284 -0
  7. 4.py +395 -0
  8. 888.py +3 -0
  9. 9999.py +15 -0
  10. README.md +152 -0
  11. __pycache__/audio.cpython-39.pyc +0 -0
  12. __pycache__/hparams.cpython-39.pyc +0 -0
  13. aaa.jpg +0 -0
  14. audio.py +135 -0
  15. checkpoints/README.md +1 -0
  16. checkpoints/lipsync_expert.pth +3 -0
  17. checkpoints/visual_quality_disc.pth +3 -0
  18. checkpoints/wav2lip.pth +3 -0
  19. checkpoints/wav2lip_gan.pth +3 -0
  20. color_syncnet_train.py +279 -0
  21. evaluation/README.md +63 -0
  22. evaluation/gen_videos_from_filelist.py +238 -0
  23. evaluation/real_videos_inference.py +305 -0
  24. evaluation/scores_LSE/SyncNetInstance_calc_scores.py +210 -0
  25. evaluation/scores_LSE/calculate_scores_LRS.py +53 -0
  26. evaluation/scores_LSE/calculate_scores_real_videos.py +45 -0
  27. evaluation/scores_LSE/calculate_scores_real_videos.sh +8 -0
  28. evaluation/test_filelists/README.md +13 -0
  29. evaluation/test_filelists/ReSyncED/random_pairs.txt +160 -0
  30. evaluation/test_filelists/ReSyncED/tts_pairs.txt +18 -0
  31. evaluation/test_filelists/lrs2.txt +0 -0
  32. evaluation/test_filelists/lrs3.txt +0 -0
  33. evaluation/test_filelists/lrw.txt +0 -0
  34. examples/driven_audio/RD_Radio31_000.wav +0 -0
  35. examples/driven_audio/RD_Radio34_002.wav +0 -0
  36. examples/driven_audio/RD_Radio36_000.wav +0 -0
  37. examples/driven_audio/RD_Radio40_000.wav +0 -0
  38. examples/driven_audio/bus_chinese.wav +0 -0
  39. examples/driven_audio/chinese_news.wav +3 -0
  40. examples/driven_audio/chinese_poem1.wav +0 -0
  41. examples/driven_audio/chinese_poem2.wav +0 -0
  42. examples/driven_audio/deyu.wav +3 -0
  43. examples/driven_audio/eluosi.wav +3 -0
  44. examples/driven_audio/fayu.wav +3 -0
  45. examples/driven_audio/imagine.wav +3 -0
  46. examples/driven_audio/itosinger1.wav +0 -0
  47. examples/driven_audio/japanese.wav +3 -0
  48. examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4 +3 -0
  49. examples/ref_video/WDA_KatieHill_000.mp4 +3 -0
  50. examples/source_image/art_0.png +0 -0
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ examples/driven_audio/chinese_news.wav filter=lfs diff=lfs merge=lfs -text
36
+ examples/driven_audio/deyu.wav filter=lfs diff=lfs merge=lfs -text
37
+ examples/driven_audio/eluosi.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/driven_audio/fayu.wav filter=lfs diff=lfs merge=lfs -text
39
+ examples/driven_audio/imagine.wav filter=lfs diff=lfs merge=lfs -text
40
+ examples/driven_audio/japanese.wav filter=lfs diff=lfs merge=lfs -text
41
+ examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ examples/ref_video/WDA_KatieHill_000.mp4 filter=lfs diff=lfs merge=lfs -text
43
+ examples/source_image/art_16.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/source_image/art_17.png filter=lfs diff=lfs merge=lfs -text
45
+ examples/source_image/art_3.png filter=lfs diff=lfs merge=lfs -text
46
+ examples/source_image/art_4.png filter=lfs diff=lfs merge=lfs -text
47
+ examples/source_image/art_5.png filter=lfs diff=lfs merge=lfs -text
48
+ examples/source_image/art_8.png filter=lfs diff=lfs merge=lfs -text
49
+ examples/source_image/art_9.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.pkl
2
+ *.jpg
3
+ *.mp4
4
+ *.pth
5
+ *.pyc
6
+ __pycache__
7
+ *.h5
8
+ *.avi
9
+ *.wav
10
+ filelists/*.txt
11
+ evaluation/test_filelists/lr*.txt
12
+ *.pyc
13
+ *.mkv
14
+ *.gif
15
+ *.webm
16
+ *.mp3
0.jpg ADDED
1.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #======= Inference code. All parameters are already set, so it can be run directly. The result is written to results/result_voice.mp4.
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+ from os import listdir, path
10
+ import numpy as np
11
+ import scipy, cv2, os, sys, argparse, audio
12
+ import json, subprocess, random, string
13
+ from tqdm import tqdm
14
+ from glob import glob
15
+ import torch, face_detection
16
+ from models import Wav2Lip
17
+ import platform
18
+
19
def _str2bool(value):
    """Parse a command-line boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including "False") becomes True. This accepts the usual
    spellings instead and rejects anything else.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('Expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')

# The checkpoint/face/audio values that used to be assigned after parse_args()
# are now defaults: running with no flags behaves as before, while explicitly
# passed flags are honoured instead of being overwritten.
parser.add_argument('--checkpoint_path', type=str, default='checkpoints/wav2lip_gan.pth',
                    help='Name of saved checkpoint to load weights from', required=False)

parser.add_argument('--face', type=str, default='examples/source_image/full4.jpeg',
                    help='Filepath of video/image that contains faces to use', required=False)
parser.add_argument('--audio', type=str, default='examples/driven_audio/itosinger1.wav',
                    help='Filepath of video/audio file to use as raw audio source', required=False)
parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.',
                    default='results/result_voice.mp4')

parser.add_argument('--static', type=_str2bool,
                    help='If True, then use only first video frame for inference', default=False)
parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)',
                    default=25., required=False)

parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
                    help='Padding (top, bottom, left, right). Please adjust to include chin at least')

parser.add_argument('--face_det_batch_size', type=int,
                    help='Batch size for face detection', default=16)
parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128)

parser.add_argument('--resize_factor', default=1, type=int,
                    help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')

parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
                    help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
                    'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width')

parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1],
                    help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.'
                    'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')

parser.add_argument('--rotate', default=False, action='store_true',
                    help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.'
                    'Use if you get a flipped result, despite feeding a normal looking video')

parser.add_argument('--nosmooth', default=False, action='store_true',
                    help='Prevent smoothing face detections over a short temporal window')

args = parser.parse_args()
# Side length (pixels) that face crops are resized to before entering the model.
args.img_size = 96

# A still image forces static mode. splitext looks only at the final suffix,
# so "./dir/photo.v2.jpg" is recognised (split('.')[1] was not).
if os.path.isfile(args.face) and os.path.splitext(args.face)[1].lstrip('.') in ['jpg', 'png', 'jpeg']:
    args.static = True
82
+
83
def get_smoothened_boxes(boxes, T):
    """Smooth per-frame boxes in place with a forward-looking moving average.

    Box ``i`` is replaced by the mean of boxes ``i .. i+T-1``. Near the end,
    where fewer than ``T`` boxes remain, the last ``T`` boxes are averaged
    instead (some of which have already been smoothed).

    Args:
        boxes: numpy array with one box per row; modified in place. With an
            integer dtype the averaged values are truncated on assignment.
        T: window length in frames.

    Returns:
        The same ``boxes`` array.
    """
    count = len(boxes)
    for i in range(count):
        if i + T > count:
            # Clamp at 0: with fewer than T boxes a negative start index would
            # wrap around and silently drop the earliest boxes from the window.
            window = boxes[max(0, count - T):]
        else:
            window = boxes[i:i + T]
        boxes[i] = np.mean(window, axis=0)
    return boxes
91
+
92
def face_detect(images):
    """Detect one face per image and return padded crops with their coordinates.

    Detection runs in batches of ``args.face_det_batch_size``; on a
    RuntimeError the batch size is halved and detection restarts from the
    first image, until a batch size of 1 also fails.

    Returns:
        A list with one ``[crop, (y1, y2, x1, x2)]`` entry per input image.

    Raises:
        RuntimeError: detection fails even with a batch size of 1.
        ValueError: no face was found in some image (that image is written
            to temp/faulty_frame.jpg first).
    """
    detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                            flip_input=False, device=device)

    step = args.face_det_batch_size
    predictions = None
    while predictions is None:
        collected = []
        try:
            for start in tqdm(range(0, len(images), step)):
                chunk = np.array(images[start:start + step])
                collected.extend(detector.get_detections_for_batch(chunk))
        except RuntimeError:
            if step == 1:
                raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
            step //= 2
            print('Recovering from OOM error; New batch size: {}'.format(step))
        else:
            predictions = collected

    top_pad, bottom_pad, left_pad, right_pad = args.pads
    padded = []
    for rect, image in zip(predictions, images):
        if rect is None:
            # Keep the offending frame on disk so it can be inspected.
            cv2.imwrite('temp/faulty_frame.jpg', image)
            raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

        # Grow the detection by the requested padding, clipped to the image bounds.
        left = max(0, rect[0] - left_pad)
        top = max(0, rect[1] - top_pad)
        right = min(image.shape[1], rect[2] + right_pad)
        bottom = min(image.shape[0], rect[3] + bottom_pad)
        padded.append([left, top, right, bottom])

    boxes = np.array(padded)
    if not args.nosmooth:
        boxes = get_smoothened_boxes(boxes, T=5)

    results = []
    for image, (x1, y1, x2, y2) in zip(images, boxes):
        results.append([image[y1: y2, x1:x2], (y1, y2, x1, x2)])

    del detector
    return results
131
+
132
def _finalize_batch(img_batch, mel_batch):
    """Turn accumulated face crops and mel chunks into model-ready arrays."""
    images = np.asarray(img_batch)
    mels = np.asarray(mel_batch)

    # Zero rows from img_size//2 downward in a copy of every face, then stack
    # that masked copy with the unmasked face along the channel axis.
    masked = images.copy()
    masked[:, args.img_size//2:] = 0

    images = np.concatenate((masked, images), axis=3) / 255.
    mels = np.reshape(mels, [len(mels), mels.shape[1], mels.shape[2], 1])
    return images, mels


def datagen(frames, mels):
    """Yield inference batches that pair face crops with mel chunks.

    Faces come from ``face_detect`` unless ``args.box`` gives a fixed box
    (first value != -1). In static mode only the first frame is used;
    otherwise frames are cycled with ``i % len(frames)`` when there are more
    mel chunks than frames.

    Args:
        frames: list of video frames (image arrays).
        mels: list of mel-spectrogram chunks, one per output frame.

    Yields:
        ``(img_batch, mel_batch, frame_batch, coords_batch)`` with at most
        ``args.wav2lip_batch_size`` items each; ``frame_batch`` holds copies
        of the full frames and ``coords_batch`` the ``(y1, y2, x1, x2)`` boxes.
    """
    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    if args.box[0] == -1:
        if not args.static:
            face_det_results = face_detect(frames)
        else:
            face_det_results = face_detect([frames[0]])
    else:
        print('Using the specified bounding box instead of face detection...')
        y1, y2, x1, x2 = args.box
        face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]

    for i, m in enumerate(mels):
        # Static input: every output frame reuses the first image.
        idx = 0 if args.static else i % len(frames)
        frame_to_save = frames[idx].copy()
        face, coords = face_det_results[idx].copy()

        face = cv2.resize(face, (args.img_size, args.img_size))

        img_batch.append(face)
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            img_batch, mel_batch = _finalize_batch(img_batch, mel_batch)
            yield img_batch, mel_batch, frame_batch, coords_batch
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    # Flush the final, possibly smaller, batch.
    if len(img_batch) > 0:
        img_batch, mel_batch = _finalize_batch(img_batch, mel_batch)
        yield img_batch, mel_batch, frame_batch, coords_batch
179
+
180
# Number of mel-spectrogram columns taken for each output video frame.
mel_step_size = 16

# Prefer the GPU whenever CUDA is available; otherwise run on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(f'Using {device} for inference.')
183
+
184
def _load(checkpoint_path):
    """Load a checkpoint file with torch.load.

    When not running on CUDA, storages are kept where they are deserialised
    (``map_location`` returns the storage unchanged) instead of being
    restored to the device they were saved from.
    """
    if device != 'cuda':
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
191
+
192
def load_model(path):
    """Build a Wav2Lip model, load weights from ``path`` and return it in eval mode.

    Every ``'module.'`` substring is removed from the state-dict keys before
    loading (presumably the prefix added by DataParallel when saving; confirm).
    """
    net = Wav2Lip()
    print("Load checkpoint from: {}".format(path))
    weights = _load(path)["state_dict"]
    cleaned = {name.replace('module.', ''): tensor for name, tensor in weights.items()}
    net.load_state_dict(cleaned)

    return net.to(device).eval()
204
+
205
def main():
    """Run lip-sync inference and mux the result with the driving audio.

    Steps: read the face image/video, convert the audio to wav if needed,
    split its mel spectrogram into one chunk per output frame, run the model
    batch by batch, paste each predicted face crop back into its frame, write
    temp/result.avi, then combine it with the audio into ``args.outfile``
    using ffmpeg.

    Raises:
        ValueError: ``args.face`` is not a file, the image cannot be read, the
            video yields no frames, or the mel spectrogram contains NaN.
    """
    if not os.path.isfile(args.face):
        raise ValueError('--face argument must be a valid path to video/image file')

    # Scratch files (temp.wav, result.avi, debug crops) go to temp/; create it
    # and the output directory up front so the writers cannot fail silently.
    os.makedirs('temp', exist_ok=True)
    out_dir = os.path.dirname(args.outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # splitext inspects only the final suffix, so paths containing extra dots
    # (e.g. "./dir/photo.v2.jpg") are classified correctly.
    if os.path.splitext(args.face)[1].lstrip('.') in ['jpg', 'png', 'jpeg']:
        image = cv2.imread(args.face)
        if image is None:
            raise ValueError('Could not read image file: {}'.format(args.face))
        full_frames = [image]
        fps = args.fps

    else:
        video_stream = cv2.VideoCapture(args.face)
        fps = video_stream.get(cv2.CAP_PROP_FPS)

        print('Reading video frames...')

        full_frames = []
        while True:
            still_reading, frame = video_stream.read()
            if not still_reading:
                video_stream.release()
                break
            if args.resize_factor > 1:
                frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))

            if args.rotate:
                # The cv2.cv2 alias is absent from recent opencv-python
                # releases; the constant is available directly on cv2.
                frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)

            y1, y2, x1, x2 = args.crop
            if x2 == -1: x2 = frame.shape[1]
            if y2 == -1: y2 = frame.shape[0]

            frame = frame[y1:y2, x1:x2]

            full_frames.append(frame)

    if not full_frames:
        raise ValueError('No frames could be read from {}'.format(args.face))

    print("Number of frames available for inference: " + str(len(full_frames)))

    if not args.audio.endswith('.wav'):
        print('Extracting raw audio...')
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through verbatim.
        subprocess.call(['ffmpeg', '-y', '-i', args.audio, '-strict', '-2', 'temp/temp.wav'])
        args.audio = 'temp/temp.wav'

    wav = audio.load_wav(args.audio, 16000)
    mel = audio.melspectrogram(wav)
    print(mel.shape)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')

    mel_chunks = []
    # Mel columns advanced per video frame; per the original note the mel has
    # 80 columns per second of audio.
    mel_idx_multiplier = 80./fps
    i = 0
    while True:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > len(mel[0]):
            # Not enough columns left: use the final mel_step_size columns and stop.
            mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
            break
        mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
        i += 1

    print("Length of mel chunks: {}".format(len(mel_chunks)))

    full_frames = full_frames[:len(mel_chunks)]

    batch_size = args.wav2lip_batch_size
    gen = datagen(full_frames.copy(), mel_chunks)

    for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen,
                                            total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
        if i == 0:
            # Loaded once the generator has produced its first batch, i.e.
            # after face detection has already run.
            model = load_model(args.checkpoint_path)
            print("Model loaded")

            frame_h, frame_w = full_frames[0].shape[:-1]
            out = cv2.VideoWriter('temp/result.avi',
                                  cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

        img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
        mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

        with torch.no_grad():
            pred = model(mel_batch, img_batch)

        pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
        # Debug dump of generated crops; numbering restarts at 1 for every
        # batch, so later batches overwrite earlier files.
        for cnt, (p, f, c) in enumerate(zip(pred, frames, coords), start=1):
            y1, y2, x1, x2 = c
            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
            cv2.imwrite(f'temp/{cnt}.png', p)
            f[y1:y2, x1:x2] = p  # paste the generated face region back into the frame
            out.write(f)

    out.release()

    # List form works on every platform without a shell.
    subprocess.call(['ffmpeg', '-y', '-i', args.audio, '-i', 'temp/result.avi',
                     '-strict', '-2', '-q:v', '1', args.outfile])


if __name__ == '__main__':
    main()
2.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Study the preprocessing code: extract every frame of each video, plus its audio.
2
+
3
+ import sys
4
+
5
# Compare as a tuple: the previous per-field `and` test only fired when both
# major < 3 and minor < 2, so interpreters such as Python 2.7 slipped through.
if sys.version_info < (3, 2):
    raise Exception("Must be using >= Python 3.2")
7
+
8
+ from os import listdir, path
9
+
10
# Fail fast when the face-detector weights are missing. Adjacent string
# literals replace the backslash continuation, which embedded the source
# indentation in the middle of the message.
if not path.isfile('face_detection/detection/sfd/s3fd.pth'):
    raise FileNotFoundError('Save the s3fd model to face_detection/detection/sfd/s3fd.pth '
                            'before running this script!')
13
+
14
+ import multiprocessing as mp
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import numpy as np
17
+ import argparse, os, cv2, traceback, subprocess
18
+ from tqdm import tqdm
19
+ from glob import glob
20
+ import audio
21
+ from hparams import hparams as hp
22
+
23
+ import face_detection
24
+
25
parser = argparse.ArgumentParser()

parser.add_argument('--ngpu', help='Number of GPUs across which to run in parallel', default=1, type=int)
parser.add_argument('--batch_size', help='Single GPU Face detection batch size', default=32, type=int)
# The paths that used to be assigned after parse_args() are now defaults:
# running with no flags behaves as before, and explicit flags are honoured.
parser.add_argument("--data_root", help="Root folder of the LRS2 dataset", required=False,
                    default='filelists')
parser.add_argument("--preprocessed_root", help="Root folder of the preprocessed dataset", required=False,
                    default='lrs2_preprocessed')

args = parser.parse_args()

# One face detector per worker slot, all pinned to the CPU here.
# GPU variant kept for reference: device='cuda:{}'.format(<slot index>).
fa = [face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False,
                                   device='cpu') for _ in range(args.ngpu)]
# ffmpeg command template for audio extraction: (input video, output wav).
template = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'
# template2 = 'ffmpeg -hide_banner -loglevel panic -threads 1 -y -i {} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {}'
46
+ # template2 = 'ffmpeg -hide_banner -loglevel panic -threads 1 -y -i {} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {}'
47
+
48
def process_video_file(vfile, args, gpu_id):
    """Detect a face in every frame of ``vfile`` and save the crops as JPEGs.

    Crops are written to ``<preprocessed_root>/<parent dir>/<video name>/<i>.jpg``
    where ``i`` is the frame index; frames without a detection are skipped, so
    the numbering may have gaps.

    Args:
        vfile: path of the video file.
        args: parsed options; ``preprocessed_root`` and ``batch_size`` are read.
        gpu_id: index into the module-level detector list ``fa``.
    """
    video_stream = cv2.VideoCapture(vfile)

    frames = []
    try:
        while True:
            still_reading, frame = video_stream.read()
            if not still_reading:
                break
            frames.append(frame)
    finally:
        # Release the capture even if reading raises.
        video_stream.release()

    vidname = os.path.basename(vfile).split('.')[0]
    # Parent directory name via os.path rather than split('/'), which raised
    # IndexError on paths that use a different separator.
    dirname = path.basename(path.dirname(vfile))

    fulldir = path.join(args.preprocessed_root, dirname, vidname)
    os.makedirs(fulldir, exist_ok=True)

    batches = [frames[i:i + args.batch_size] for i in range(0, len(frames), args.batch_size)]

    i = -1  # running frame index across all batches
    for fb in batches:
        preds = fa[gpu_id].get_detections_for_batch(np.asarray(fb))

        for j, f in enumerate(preds):
            i += 1
            if f is None:
                continue

            x1, y1, x2, y2 = f
            cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), fb[j][y1:y2, x1:x2])
78
+
79
def process_audio_file(vfile, args):
    """Extract the audio track of ``vfile`` to ``audio.wav`` with ffmpeg.

    The wav is written to ``<preprocessed_root>/<parent dir>/<video name>/``.

    Args:
        vfile: path of the video file.
        args: parsed options; ``preprocessed_root`` is read.
    """
    vidname = os.path.basename(vfile).split('.')[0]
    # Parent directory name via os.path rather than split('/'), which raised
    # IndexError on paths that use a different separator.
    dirname = path.basename(path.dirname(vfile))

    fulldir = path.join(args.preprocessed_root, dirname, vidname)
    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')

    # Same ffmpeg options as the module-level template, passed as an argument
    # list so paths with spaces or shell metacharacters are handled safely.
    command = ['ffmpeg', '-loglevel', 'panic', '-y', '-i', vfile, '-strict', '-2', wavpath]
    subprocess.call(command)
90
+
91
+
92
def mp_handler(job):
    """Process one ``(vfile, args, gpu_id)`` job, logging failures instead of raising.

    A KeyboardInterrupt ends the program with status 0. Any other Exception is
    printed with its traceback and swallowed so one bad video does not stop
    the run.
    """
    vfile, args, gpu_id = job
    try:
        process_video_file(vfile, args, gpu_id)
    except KeyboardInterrupt:
        sys.exit(0)
    except Exception:
        # Narrower than the previous bare `except:`, which also intercepted
        # SystemExit and GeneratorExit.
        traceback.print_exc()
100
+
101
+
102
+ # print('Started processing for {} with {} GPUs'.format(args.data_root, args.ngpu))
103
+
104
# Every "<data_root>/<subdir>/<name>.mp4" file is processed.
filelist = glob(path.join(args.data_root, '*/*.mp4'))

# Assign videos round-robin to the available detector slots.
jobs = [(vfile, args, i % args.ngpu) for i, vfile in enumerate(filelist)]

# Run the face-cropping jobs on the pool. Previously the executor was created
# but never used or shut down, and the jobs ran inline in a list comprehension.
with ThreadPoolExecutor(args.ngpu) as p:
    futures = [p.submit(mp_handler, job) for job in jobs]
    _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))]

print('Dumping audios...')

for vfile in tqdm(filelist):
    try:
        process_audio_file(vfile, args)
    except KeyboardInterrupt:
        sys.exit(0)
    except Exception:
        traceback.print_exc()
        continue

# The stray bare `raise` that ended the script was removed: outside an except
# block it always failed with "RuntimeError: No active exception to reraise".
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
3.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # First train a SyncNet for 4.py to use. The goal is a classifier of whether audio and video are in sync; its output is the probability that they are.
2
+ import trl
3
+
4
+ from os.path import dirname, join, basename, isfile
5
+ from tqdm import tqdm
6
+
7
+ from models import SyncNet_color as SyncNet
8
+ import audio
9
+
10
+ import torch
11
+ from torch import nn
12
+ from torch import optim
13
+ import torch.backends.cudnn as cudnn
14
+ from torch.utils import data as data_utils
15
+ import numpy as np
16
+
17
+ from glob import glob
18
+
19
+ import os, random, cv2, argparse
20
+ from hparams import hparams, get_image_list
21
+
22
parser = argparse.ArgumentParser(description='Code to train the expert lip-sync discriminator')

# The paths that used to be assigned after parse_args() are now defaults:
# running with no flags behaves as before, and explicit flags are honoured.
parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset", required=False,
                    default='lrs2_preprocessed/LRS2_partly')

parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory', required=False, type=str,
                    default='./tmp2')
parser.add_argument('--checkpoint_path', help='Resumed from this checkpoint', default=None, type=str)

args = parser.parse_args()

# Training progress counters; load_checkpoint() overwrites them when resuming.
global_step = 0
global_epoch = 0
use_cuda = torch.cuda.is_available()
print('use_cuda: {}'.format(use_cuda))

# Number of consecutive frames in one training window.
syncnet_T = 5
# Number of mel-spectrogram steps paired with one frame window.
syncnet_mel_step_size = 16
40
+
41
class Dataset(object):
    """Random sampler of (frame window, mel window, sync label) training triples.

    Each item pairs ``syncnet_T`` consecutive face crops with a mel window of
    ``syncnet_mel_step_size`` steps. The label is 1 when the frames start at
    the same frame as the audio window and 0 when they start at a different,
    randomly chosen frame of the same video.
    """

    def __init__(self, split):
        # NOTE: `split` is currently ignored, so every split sees the same
        # videos. The original filelist-based lookup is kept for reference:
        # self.all_videos = get_image_list(args.data_root, split)
        # Each sub-directory of args.data_root is one preprocessed video
        # (previously a hard-coded copy of that path).
        self.all_videos = glob(join(args.data_root, '*'))
        print(self.all_videos)

    def get_frame_id(self, frame):
        """Return the integer frame index encoded in a name such as '12.jpg'."""
        return int(basename(frame).split('.')[0])

    def get_window(self, start_frame):
        """Return paths of ``syncnet_T`` consecutive frames starting at ``start_frame``.

        Returns None if any frame file in the window does not exist.
        """
        start_id = self.get_frame_id(start_frame)
        vidname = dirname(start_frame)

        window_fnames = []
        for frame_id in range(start_id, start_id + syncnet_T):
            frame = join(vidname, '{}.jpg'.format(frame_id))
            if not isfile(frame):
                return None
            window_fnames.append(frame)
        return window_fnames

    def crop_audio_window(self, spec, start_frame):
        """Slice the mel rows that correspond to the window starting at ``start_frame``."""
        # num_frames = (T x hop_size * fps) / sample_rate
        start_frame_num = self.get_frame_id(start_frame)
        # The factor 80 converts seconds to mel steps (80 steps per second
        # according to the note in 1.py).
        start_idx = int(80. * (start_frame_num / float(hparams.fps)))

        end_idx = start_idx + syncnet_mel_step_size

        return spec[start_idx: end_idx, :]

    def __len__(self):
        return len(self.all_videos)

    def __getitem__(self, idx):
        """Return one random ``(x, mel, y)`` sample; ``idx`` is ignored.

        Sampling retries until a video yields a complete, readable frame
        window and a full-length mel window.
        """
        while 1:
            # Pick a random video regardless of the requested index.
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]

            img_names = list(glob(join(vidname, '*.jpg')))
            if len(img_names) <= 3 * syncnet_T:
                continue
            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            # Choose between an in-sync (positive) and off-sync (negative) sample.
            if random.choice([True, False]):
                y = torch.ones(1).float()
                chosen = img_name
            else:
                y = torch.zeros(1).float()
                chosen = wrong_img_name

            window_fnames = self.get_window(chosen)
            if window_fnames is None:
                continue

            window = []
            all_read = True
            for fname in window_fnames:
                img = cv2.imread(fname)
                if img is None:
                    all_read = False
                    break
                try:
                    img = cv2.resize(img, (hparams.img_size, hparams.img_size))
                except Exception:
                    all_read = False
                    break

                window.append(img)

            if not all_read: continue

            try:
                wavpath = join(vidname, "audio.wav")
                wav = audio.load_wav(wavpath, hparams.sample_rate)

                orig_mel = audio.melspectrogram(wav).T
            except Exception:
                continue

            # The audio window always follows img_name; only the frames differ
            # between positive and negative samples.
            mel = self.crop_audio_window(orig_mel.copy(), img_name)

            if (mel.shape[0] != syncnet_mel_step_size):
                continue

            # H x W x 3 * T
            x = np.concatenate(window, axis=2) / 255.
            x = x.transpose(2, 0, 1)
            # Keep only the lower half of each frame. Original author's guess:
            # the mouth is always in the lower half, so dropping the upper
            # half should speed up convergence.
            x = x[:, x.shape[1]//2:]

            x = torch.FloatTensor(x)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)

            return x, mel, y
137
+
138
# Binary cross-entropy applied to the cosine similarity, which is treated as
# the probability that audio and video are in sync.
logloss = nn.BCELoss()


def cosine_loss(a, v, y):
    """Return the BCE loss between cosine_similarity(a, v) and the labels ``y``.

    The per-sample similarity gets a trailing dimension of size 1 before being
    compared with ``y``.
    """
    similarity = nn.functional.cosine_similarity(a, v)
    return logloss(similarity.unsqueeze(1), y)
144
+
145
def train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
    """Train the sync model until ``global_epoch`` reaches ``nepochs``.

    Checkpoints are saved at global step 1 and every ``checkpoint_interval``
    steps; evaluation runs every ``hparams.syncnet_eval_interval`` steps. The
    module-level ``global_step`` / ``global_epoch`` counters are updated in
    place.
    """
    global global_step, global_epoch

    while global_epoch < nepochs:
        running_loss = 0.
        prog_bar = tqdm(enumerate(train_data_loader))
        for step, (x, mel, y) in prog_bar:
            # eval_model() switches the model to eval mode, so train mode is
            # re-enabled at the start of every step.
            model.train()
            optimizer.zero_grad()

            # Move the batch to the training device.
            x = x.to(device)
            mel = mel.to(device)

            a, v = model(mel, x)
            y = y.to(device)

            loss = cosine_loss(a, v, y)
            loss.backward()
            optimizer.step()

            global_step += 1
            running_loss += loss.item()

            if global_step == 1 or global_step % checkpoint_interval == 0:
                save_checkpoint(
                    model, optimizer, global_step, checkpoint_dir, global_epoch)

            if global_step % hparams.syncnet_eval_interval == 0:
                with torch.no_grad():
                    eval_model(test_data_loader, global_step, device, model, checkpoint_dir)

            # Running mean of the loss over the current epoch.
            prog_bar.set_description('Loss: {}'.format(running_loss / (step + 1)))

        global_epoch += 1
185
+
186
def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
    """Print the mean sync loss over one pass of ``test_data_loader``.

    The pass stops early once the batch index exceeds 1400. ``global_step``
    and ``checkpoint_dir`` are accepted for interface compatibility and are
    not used. The model is left in eval mode.
    """
    eval_steps = 1400
    print('Evaluating for {} steps'.format(eval_steps))
    losses = []
    model.eval()
    # The former `while 1:` wrapper returned at the end of its first
    # iteration, so a single pass over the loader is equivalent.
    for step, (x, mel, y) in enumerate(test_data_loader):
        # Move the batch to the evaluation device.
        x = x.to(device)
        mel = mel.to(device)

        a, v = model(mel, x)
        y = y.to(device)

        loss = cosine_loss(a, v, y)
        losses.append(loss.item())

        if step > eval_steps: break

    if not losses:
        # An empty loader used to end in ZeroDivisionError.
        print('No evaluation batches were produced; skipping loss report')
        return

    averaged_loss = sum(losses) / len(losses)
    print(averaged_loss)

    return
212
+
213
def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
    """Save model (and optionally optimizer) state to ``checkpoint_dir``.

    The file is named ``checkpoint_step<step, zero-padded to 9 digits>.pth``.
    The optimizer state is stored only when
    ``hparams.save_optimizer_state`` is truthy; otherwise None is stored.
    """
    # Name the file after the `step` argument; the global counter was used
    # before, which only matched because callers happened to pass it.
    checkpoint_path = join(
        checkpoint_dir, "checkpoint_step{:09d}.pth".format(step))
    optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
    torch.save({
        "state_dict": model.state_dict(),
        "optimizer": optimizer_state,
        "global_step": step,
        "global_epoch": epoch,
    }, checkpoint_path)
    print("Saved checkpoint:", checkpoint_path)
225
+
226
def _load(checkpoint_path):
    """Load a checkpoint file with torch.load.

    Without CUDA, storages are kept where they are deserialised
    (``map_location`` returns the storage unchanged) instead of being
    restored to the device they were saved from.
    """
    if not use_cuda:
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
233
+
234
def load_checkpoint(path, model, optimizer, reset_optimizer=False):
    """Restore model weights, optimizer state and progress counters from ``path``.

    The optimizer state is loaded only when ``reset_optimizer`` is False and
    the checkpoint stores a non-None state. The module-level ``global_step``
    and ``global_epoch`` are set from the checkpoint.

    Returns:
        The ``model`` that was passed in.
    """
    global global_step, global_epoch

    print("Load checkpoint from: {}".format(path))
    state = _load(path)
    model.load_state_dict(state["state_dict"])

    saved_optimizer = None if reset_optimizer else state["optimizer"]
    if saved_optimizer is not None:
        print("Load optimizer state from {}".format(path))
        optimizer.load_state_dict(saved_optimizer)

    global_step = state["global_step"]
    global_epoch = state["global_epoch"]

    return model
250
+
251
if __name__ == "__main__":
    checkpoint_dir = args.checkpoint_dir
    checkpoint_path = args.checkpoint_path

    # makedirs also creates missing parent directories, which os.mkdir did not.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Dataset and Dataloader setup
    train_dataset = Dataset('train')
    test_dataset = Dataset('val')

    train_data_loader = data_utils.DataLoader(
        train_dataset, batch_size=hparams.syncnet_batch_size, shuffle=True,
        num_workers=hparams.num_workers)

    test_data_loader = data_utils.DataLoader(
        test_dataset, batch_size=hparams.syncnet_batch_size,
        num_workers=8)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Model
    model = SyncNet().to(device)
    print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=hparams.syncnet_lr)

    # Resume from a checkpoint when one was given on the command line.
    if checkpoint_path is not None:
        load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer=False)

    train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=checkpoint_dir,
          checkpoint_interval=hparams.syncnet_checkpoint_interval,
          nepochs=hparams.nepochs)
4.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 学习训练代码.
2
+ import glob
3
+ # print(glob.glob('./lrs2_preprocessed/LRS2_partly/*/*.jpg'))
4
+ # print(glob.glob('./*.*'))
5
+
6
+ from os.path import dirname, join, basename, isfile
7
+ from tqdm import tqdm
8
+
9
+ from models import SyncNet_color as SyncNet
10
+ from models import Wav2Lip as Wav2Lip
11
+ import audio
12
+
13
+ import torch
14
+ from torch import nn
15
+ from torch import optim
16
+ import torch.backends.cudnn as cudnn
17
+ from torch.utils import data as data_utils
18
+ import numpy as np
19
+
20
+ from glob import glob
21
+
22
+ import os, random, cv2, argparse
23
+ from hparams import hparams, get_image_list
24
+
25
# Command-line interface. The values that used to be force-assigned onto
# `args` after parsing are now argparse defaults, so running without flags
# behaves as before while explicitly passed flags are honoured.
parser = argparse.ArgumentParser(description='Code to train the Wav2Lip model without the visual quality discriminator')

parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset",
                    required=False, type=str, default='lrs2_preprocessed')

parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory',
                    required=False, type=str, default='newmodel')
parser.add_argument('--syncnet_checkpoint_path', help='Load the pre-trained Expert discriminator',
                    required=False, type=str, default='checkpoints/lipsync_expert.pth')

parser.add_argument('--checkpoint_path', help='Resume from this checkpoint', default=None, type=str)

args = parser.parse_args()

# Training progress counters; updated by train() and load_checkpoint().
global_step = 0
global_epoch = 0
use_cuda = torch.cuda.is_available()
print('use_cuda: {}'.format(use_cuda))

# Number of consecutive video frames per sample.
syncnet_T = 5
# Number of mel-spectrogram steps in one audio window.
syncnet_mel_step_size = 16
58
+
59
class Dataset(object):
    """Randomly sampled Wav2Lip training examples.

    Each item is built from one video folder containing numbered ``*.jpg``
    frames and an ``audio.wav`` file. ``__getitem__`` ignores the requested
    index and keeps drawing random folders/frames until a valid sample can be
    assembled.
    """

    def __init__(self, split):
        """Collect the video folders found under ``<data_root>/LRS2_partly``.

        NOTE(review): ``split`` is accepted but not used, so every split sees
        the same folders. The original code had a commented-out
        ``get_image_list(args.data_root, split)`` call here — confirm whether
        that should be restored.
        """
        self.all_videos = glob(join(args.data_root, 'LRS2_partly', '*'))
        print(self.all_videos)

    def get_frame_id(self, frame):
        """Return the integer frame number encoded in a path such as ``dir/12.jpg``."""
        return int(basename(frame).split('.')[0])

    def get_window(self, start_frame):
        """Return the paths of ``syncnet_T`` consecutive frames starting at ``start_frame``.

        Returns ``None`` if any of the frames does not exist on disk.
        """
        start_id = self.get_frame_id(start_frame)
        vidname = dirname(start_frame)

        window_fnames = []
        for frame_id in range(start_id, start_id + syncnet_T):
            frame = join(vidname, '{}.jpg'.format(frame_id))  # take syncnet_T (5) frames
            if not isfile(frame):
                return None
            window_fnames.append(frame)
        return window_fnames

    def read_window(self, window_fnames):
        """Read and resize every frame in ``window_fnames``.

        Returns a list of images resized to ``hparams.img_size`` squared, or
        ``None`` when the input is ``None`` or any frame cannot be read/resized.
        """
        if window_fnames is None:
            return None
        window = []
        for fname in window_fnames:
            img = cv2.imread(fname)
            if img is None:
                return None
            try:
                img = cv2.resize(img, (hparams.img_size, hparams.img_size))
            except Exception:
                # Best effort: an unusable frame invalidates the whole window.
                return None

            window.append(img)

        return window

    def crop_audio_window(self, spec, start_frame):
        """Slice ``syncnet_mel_step_size`` mel steps starting at ``start_frame``.

        ``start_frame`` is either an integer frame number or a frame path.
        The slice may be shorter than requested near the end of ``spec``;
        callers check its length.
        """
        if isinstance(start_frame, int):
            start_frame_num = start_frame
        else:
            start_frame_num = self.get_frame_id(start_frame)
        # The code assumes 80 mel steps per second; start_frame_num / fps is
        # the elapsed time in seconds at that frame.
        start_idx = int(80. * (start_frame_num / float(hparams.fps)))
        end_idx = start_idx + syncnet_mel_step_size

        return spec[start_idx: end_idx, :]

    def get_segmented_mels(self, spec, start_frame):
        """Return one transposed mel window per frame of the video window.

        Returns ``None`` when the window would start before the audio or when
        any per-frame slice is shorter than ``syncnet_mel_step_size``.
        """
        mels = []
        assert syncnet_T == 5
        start_frame_num = self.get_frame_id(start_frame) + 1  # 0-indexing ---> 1-indexing
        if start_frame_num - 2 < 0:
            return None
        for i in range(start_frame_num, start_frame_num + syncnet_T):
            m = self.crop_audio_window(spec, i - 2)
            if m.shape[0] != syncnet_mel_step_size:
                return None
            mels.append(m.T)

        mels = np.asarray(mels)

        return mels

    def prepare_window(self, window):
        """Scale a list of frames to [0, 1] and move the last axis first.

        For T frames of shape H x W x 3 the result is 3 x T x H x W.
        """
        x = np.asarray(window) / 255.
        x = np.transpose(x, (3, 0, 1, 2))

        return x

    def __len__(self):
        """Return the number of video folders."""
        return len(self.all_videos)

    def __getitem__(self, idx):
        """Return one random ``(x, indiv_mels, mel, y)`` training tuple.

        ``idx`` is ignored; sampling is random and retried until it succeeds.
        NOTE(review): if no folder can ever yield a valid sample this loops
        forever.
        """
        while 1:
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]
            img_names = glob(join(vidname, '*.jpg'))
            if len(img_names) <= 3 * syncnet_T:
                continue

            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            window_fnames = self.get_window(img_name)
            wrong_window_fnames = self.get_window(wrong_img_name)
            if window_fnames is None or wrong_window_fnames is None:
                continue

            window = self.read_window(window_fnames)
            if window is None:
                continue

            wrong_window = self.read_window(wrong_window_fnames)
            if wrong_window is None:
                continue

            try:
                wavpath = join(vidname, "audio.wav")
                wav = audio.load_wav(wavpath, hparams.sample_rate)

                orig_mel = audio.melspectrogram(wav).T
            except Exception:
                # Best effort: unreadable audio just triggers another draw.
                continue
            # mel: the audio window (syncnet_mel_step_size steps) that starts
            # at the chosen frame.
            mel = self.crop_audio_window(orig_mel.copy(), img_name)

            if (mel.shape[0] != syncnet_mel_step_size):
                continue
            # indiv_mels: one audio window per frame of the video window.
            indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
            if indiv_mels is None:
                continue

            window = self.prepare_window(window)
            y = window.copy()  # unmasked frames serve as the ground-truth label
            # Zero out the second half of axis 2 (image rows) of the input frames.
            window[:, :, window.shape[2]//2:] = 0.
            # `window` holds the masked target frames; `wrong_window` holds
            # frames from a different position in the same video.
            wrong_window = self.prepare_window(wrong_window)
            x = np.concatenate([window, wrong_window], axis=0)

            x = torch.FloatTensor(x)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)
            indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
            y = torch.FloatTensor(y)
            return x, indiv_mels, mel, y
185
+
186
def save_sample_images(x, g, gt, global_step, checkpoint_dir):
    """Write side-by-side sample collages for the current training step.

    Images are written to ``<checkpoint_dir>/samples_step<global_step>/`` as
    ``<batch_idx>_<t>.jpg``. Each collage concatenates, along the
    second-to-last axis, channels 3+ of ``x``, channels 0-2 of ``x``, the
    generated frames ``g`` and the ground truth ``gt``.

    Args:
        x: Model input tensor; after the transpose its last axis is split into
            ``[:3]`` (inputs) and ``[3:]`` (references).
        g: Generated frames tensor.
        gt: Ground-truth frames tensor.
        global_step: Step number used in the folder name.
        checkpoint_dir: Parent directory for the sample folder.
    """
    def to_uint8(t):
        # Move axis 1 to the end and rescale from [0, 1] to 8-bit values.
        return (t.detach().cpu().numpy().transpose(0, 2, 3, 4, 1) * 255.).astype(np.uint8)

    x, g, gt = to_uint8(x), to_uint8(g), to_uint8(gt)

    refs, inps = x[..., 3:], x[..., :3]
    folder = join(checkpoint_dir, "samples_step{:09d}".format(global_step))
    # exist_ok avoids the exists()/mkdir() race and creates missing parents.
    os.makedirs(folder, exist_ok=True)
    collage = np.concatenate((refs, inps, g, gt), axis=-2)
    for batch_idx, c in enumerate(collage):
        for t, frame in enumerate(c):
            cv2.imwrite('{}/{}_{}.jpg'.format(folder, batch_idx, t), frame)
198
+
199
logloss = nn.BCELoss()


def cosine_loss(a, v, y):
    """Binary cross-entropy between the cosine similarity of ``a``/``v`` and ``y``.

    Args:
        a: First embedding batch.
        v: Second embedding batch, same shape as ``a``.
        y: Target tensor shaped like the similarity with a trailing axis of 1.

    Returns:
        The scalar ``nn.BCELoss`` value.
    """
    similarity = nn.functional.cosine_similarity(a, v)
    return logloss(similarity.unsqueeze(1), y)
205
+
206
device = torch.device("cuda" if use_cuda else "cpu")
# Sync model used only to score generated frames; its parameters are frozen.
# NOTE(review): the model is never switched to eval() here — confirm that is intended.
syncnet = SyncNet().to(device)
for param in syncnet.parameters():
    param.requires_grad = False

recon_loss = nn.L1Loss()


def get_sync_loss(mel, g):
    """Score generated frames ``g`` against audio ``mel`` with the frozen sync model.

    Only the second half of axis 3 of ``g`` is used; the ``syncnet_T`` frames
    are then concatenated along the channel axis before being fed to
    ``syncnet``. The target is all ones, i.e. the pair is treated as in sync.
    """
    lower_half = g[:, :, :, g.size(3)//2:]
    per_frame = [lower_half[:, :, t] for t in range(syncnet_T)]
    stacked = torch.cat(per_frame, dim=1)
    # B, 3 * T, H//2, W
    audio_emb, video_emb = syncnet(mel, stacked)
    target = torch.ones(stacked.size(0), 1).float().to(device)
    return cosine_loss(audio_emb, video_emb, target)
219
+
220
def train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
    """Run the training loop until ``global_epoch`` reaches ``nepochs``.

    Per step the loss is
    ``syncnet_wt * sync_loss + (1 - syncnet_wt) * l1_loss``. Sample images and
    checkpoints are written every ``checkpoint_interval`` steps, and the model
    is evaluated every ``hparams.eval_interval`` steps. Once the evaluation
    sync loss drops below 0.75, ``hparams.syncnet_wt`` is set to 0.01.

    Updates the module-level ``global_step`` and ``global_epoch`` counters.
    """
    global global_step, global_epoch

    while global_epoch < nepochs:
        print('Starting Epoch: {}'.format(global_epoch))
        # Both running sums start at zero so the displayed averages are unbiased.
        running_sync_loss, running_l1_loss = 0., 0.
        prog_bar = tqdm(enumerate(train_data_loader))
        for step, (x, indiv_mels, mel, gt) in prog_bar:
            # eval_model() switches the model to eval mode, so re-enable
            # training mode on every step.
            model.train()
            optimizer.zero_grad()

            # Move data to CUDA device
            x = x.to(device)
            mel = mel.to(device)
            indiv_mels = indiv_mels.to(device)
            gt = gt.to(device)

            # The model receives the per-frame mel windows and the input
            # frames and returns the generated frames.
            g = model(indiv_mels, x)

            if hparams.syncnet_wt > 0.:
                sync_loss = get_sync_loss(mel, g)
            else:
                sync_loss = 0.

            l1loss = recon_loss(g, gt)

            loss = hparams.syncnet_wt * sync_loss + (1 - hparams.syncnet_wt) * l1loss
            loss.backward()
            optimizer.step()

            if global_step % checkpoint_interval == 0:
                save_sample_images(x, g, gt, global_step, checkpoint_dir)

            global_step += 1

            running_l1_loss += l1loss.item()
            if hparams.syncnet_wt > 0.:
                running_sync_loss += sync_loss.item()
            else:
                running_sync_loss += 0.

            if global_step == 1 or global_step % checkpoint_interval == 0:
                save_checkpoint(
                    model, optimizer, global_step, checkpoint_dir, global_epoch)

            if global_step == 1 or global_step % hparams.eval_interval == 0:
                with torch.no_grad():
                    average_sync_loss = eval_model(test_data_loader, global_step, device, model, checkpoint_dir)

                    if average_sync_loss < .75:
                        hparams.set_hparam('syncnet_wt', 0.01)  # without image GAN a lesser weight is sufficient

            prog_bar.set_description('L1: {}, Sync Loss: {}'.format(running_l1_loss / (step + 1),
                                                                    running_sync_loss / (step + 1)))

        global_epoch += 1
280
+
281
+
282
def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
    """Evaluate ``model`` for a fixed number of batches and return the mean sync loss.

    The loader is re-iterated as often as needed to reach ``eval_steps``
    batches. Gradient tracking is not disabled here; the caller is expected to
    wrap the call in ``torch.no_grad()``.

    Args:
        test_data_loader: Iterable yielding ``(x, indiv_mels, mel, gt)`` batches.
        global_step: Unused; kept for interface compatibility.
        device: Device the batches are moved to.
        model: Model under evaluation; left in eval mode on return.
        checkpoint_dir: Unused; kept for interface compatibility.

    Returns:
        The average sync loss over the evaluated batches.

    Raises:
        ValueError: If ``test_data_loader`` yields no batches (the previous
            implementation looped forever in that case).
    """
    eval_steps = 700
    print('Evaluating for {} steps'.format(eval_steps))
    sync_losses, recon_losses = [], []
    model.eval()

    while len(sync_losses) < eval_steps:
        seen_before = len(sync_losses)
        for x, indiv_mels, mel, gt in test_data_loader:
            # Move data to CUDA device
            x = x.to(device)
            gt = gt.to(device)
            indiv_mels = indiv_mels.to(device)
            mel = mel.to(device)

            g = model(indiv_mels, x)

            sync_loss = get_sync_loss(mel, g)
            l1loss = recon_loss(g, gt)

            sync_losses.append(sync_loss.item())
            recon_losses.append(l1loss.item())

            # Stop after exactly eval_steps batches, matching the message above.
            if len(sync_losses) >= eval_steps:
                break

        if len(sync_losses) == seen_before:
            raise ValueError('test_data_loader yielded no batches; cannot evaluate')

    averaged_sync_loss = sum(sync_losses) / len(sync_losses)
    averaged_recon_loss = sum(recon_losses) / len(recon_losses)

    print('L1: {}, Sync loss: {}'.format(averaged_recon_loss, averaged_sync_loss))

    return averaged_sync_loss
313
+
314
def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
    """Write a training checkpoint to ``<checkpoint_dir>/checkpoint_step<step>.pth``.

    The saved dict has the keys ``state_dict``, ``optimizer`` (``None`` unless
    ``hparams.save_optimizer_state`` is truthy), ``global_step`` and
    ``global_epoch``.

    Args:
        model: Model whose ``state_dict()`` is saved.
        optimizer: Optimizer whose state is saved when enabled in hparams.
        step: Step number stored in the checkpoint and used in the file name.
        checkpoint_dir: Directory the file is written to.
        epoch: Epoch number stored in the checkpoint.
    """
    # Name the file after the `step` argument (not the module global) so the
    # file name always agrees with the "global_step" value stored inside it.
    checkpoint_path = join(
        checkpoint_dir, "checkpoint_step{:09d}.pth".format(step))
    optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
    torch.save({
        "state_dict": model.state_dict(),
        "optimizer": optimizer_state,
        "global_step": step,
        "global_epoch": epoch,
    }, checkpoint_path)
    print("Saved checkpoint:", checkpoint_path)
326
+
327
+
328
def _load(checkpoint_path):
    """Load a checkpoint with ``torch.load``, keeping storages as-is when CUDA is unavailable.

    NOTE(review): ``torch.load`` unpickles the file; only load checkpoints
    from trusted sources.
    """
    if not use_cuda:
        # The identity map_location returns each storage unchanged instead of
        # moving it to the device it was saved from.
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
335
+
336
def load_checkpoint(path, model, optimizer, reset_optimizer=False, overwrite_global_states=True):
    """Restore model weights, and optionally optimizer state and progress counters.

    Args:
        path: Checkpoint location, forwarded to ``_load``.
        model: Object whose ``load_state_dict`` receives the cleaned state dict.
        optimizer: Optimizer to restore; only used when ``reset_optimizer`` is
            false and the checkpoint holds a non-``None`` optimizer state.
        reset_optimizer: When true, the saved optimizer state is not read.
        overwrite_global_states: When true, the module-level ``global_step``
            and ``global_epoch`` are replaced by the checkpoint's values.

    Returns:
        The ``model`` object that was passed in.
    """
    global global_step, global_epoch

    print("Load checkpoint from: {}".format(path))
    ckpt = _load(path)

    # Every occurrence of 'module.' is removed from each parameter name.
    # NOTE(review): this presumably targets a wrapper prefix, but it also
    # rewrites names containing 'module.' elsewhere — confirm that is acceptable.
    cleaned = {key.replace('module.', ''): value
               for key, value in ckpt["state_dict"].items()}
    model.load_state_dict(cleaned)

    saved_optimizer = None if reset_optimizer else ckpt["optimizer"]
    if saved_optimizer is not None:
        print("Load optimizer state from {}".format(path))
        optimizer.load_state_dict(saved_optimizer)

    if overwrite_global_states:
        global_step = ckpt["global_step"]
        global_epoch = ckpt["global_epoch"]

    return model
357
+
358
if __name__ == "__main__":
    # Entry point: build the data loaders, the Wav2Lip model and its optimizer,
    # load checkpoints, then run training.
    checkpoint_dir = args.checkpoint_dir

    # Dataset and Dataloader setup
    train_dataset = Dataset('train')
    test_dataset = Dataset('val')
    # NOTE(review): these assignments override whatever hparams defines —
    # presumably a local debugging setting; confirm before a real run.
    hparams.batch_size = 1
    hparams.num_workers = 0
    train_data_loader = data_utils.DataLoader(
        train_dataset, batch_size=hparams.batch_size, shuffle=True,
        num_workers=hparams.num_workers)

    test_data_loader = data_utils.DataLoader(
        test_dataset, batch_size=hparams.batch_size,
        num_workers=hparams.num_workers)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Model
    model = Wav2Lip().to(device)
    print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=hparams.initial_learning_rate)

    if args.checkpoint_path is not None:
        load_checkpoint(args.checkpoint_path, model, optimizer, reset_optimizer=False)

    # Load the sync model's weights without touching the training counters.
    load_checkpoint(args.syncnet_checkpoint_path, syncnet, None, reset_optimizer=True, overwrite_global_states=False)

    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of `exists()` followed by `mkdir()`.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Train!
    train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=checkpoint_dir,
          checkpoint_interval=hparams.checkpoint_interval,
          nepochs=hparams.nepochs)
888.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
import cv2

# Drop the first 50 rows and first 20 columns of 0.jpg and save the rest as aaa.jpg.
a = cv2.imread('0.jpg')
if a is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError("could not read image '0.jpg'")
cv2.imwrite('aaa.jpg', a[50:, 20:])
9999.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 下载:https://www.tingclass.net/down-5059-1135-1.html
2
+
3
+
4
+
5
import requests
import wget

# Download the .mp3 and the matching .lrc file for every "<j>_<i>" pair,
# with i in 1..24 and j in 1..2.
for i in range(1, 25):
    for j in range(1, 3):
        tmp = str(j) + '_' + str(i)
        url = f'https://online1.tingclass.net/lesson/shi0529/0000/59/{tmp}.mp3'
        wget.download(url)

        url = f'https://down11.tingclass.net/textrar/lesson/0000/59/{tmp}.lrc'
        wget.download(url)
README.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **Wav2Lip**: *Accurately Lip-syncing Videos In The Wild*
2
+
3
+ For commercial requests, please contact us at [email protected] or [email protected]. We have an HD model ready that can be used commercially.
4
+
5
+ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild_ published at ACM Multimedia 2020.
6
+
7
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrs2)](https://paperswithcode.com/sota/lip-sync-on-lrs2?p=a-lip-sync-expert-is-all-you-need-for-speech)
8
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrs3)](https://paperswithcode.com/sota/lip-sync-on-lrs3?p=a-lip-sync-expert-is-all-you-need-for-speech)
9
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/a-lip-sync-expert-is-all-you-need-for-speech/lip-sync-on-lrw)](https://paperswithcode.com/sota/lip-sync-on-lrw?p=a-lip-sync-expert-is-all-you-need-for-speech)
10
+
11
+ |📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook
12
+ |:-:|:-:|:-:|:-:|:-:|
13
+ [Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) /[Updated Collab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH)
14
+
15
+ <img src="https://drive.google.com/uc?export=view&id=1Wn0hPmpo4GRbCIJR8Tf20Akzdi1qjjG9"/>
16
+
17
+ ----------
18
+ **Highlights**
19
+ ----------
20
+ - Weights of the visual quality discriminator have been updated in the README!
21
+ - Lip-sync videos to any target speech with high accuracy :100:. Try our [interactive demo](https://bhaasha.iiit.ac.in/lipsync).
22
+ - :sparkles: Works for any identity, voice, and language. Also works for CGI faces and synthetic voices.
23
+ - Complete training code, inference code, and pretrained models are available :boom:
24
+ - Or, quick-start with the Google Colab Notebook: [Link](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing). Checkpoints and samples are available in a Google Drive [folder](https://drive.google.com/drive/folders/1I-0dNLfFOSFwrfqjNa-SXuwaURHE5K4k?usp=sharing) as well. There is also a [tutorial video](https://www.youtube.com/watch?v=Ic0TBhfuOrA) on this, courtesy of [What Make Art](https://www.youtube.com/channel/UCmGXH-jy0o2CuhqtpxbaQgA). Also, thanks to [Eyal Gruss](https://eyalgruss.com), there is a more accessible [Google Colab notebook](https://j.mp/wav2lip) with more useful features. A tutorial collab notebook is present at this [link](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH).
25
+ - :fire: :fire: Several new, reliable evaluation benchmarks and metrics [[`evaluation/` folder of this repo]](https://github.com/Rudrabha/Wav2Lip/tree/master/evaluation) released. Instructions to calculate the metrics reported in the paper are also present.
26
+
27
+ --------
28
+ **Disclaimer**
29
+ --------
30
+ All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the <a href="http://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html">LRS2 dataset</a>, any form of commercial use is strictly prohibited. For commercial requests please contact us directly!
31
+
32
+ Prerequisites
33
+ -------------
34
+ - `Python 3.6`
35
+ - ffmpeg: `sudo apt-get install ffmpeg`
36
+ - Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a docker image are provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues.
37
+ - Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work.
38
+
39
+ Getting the weights
40
+ ----------
41
+ | Model | Description | Link to the model |
42
+ | :-------------: | :---------------: | :---------------: |
43
+ | Wav2Lip | Highly accurate lip-sync | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?e=TBFBVW) |
44
+ | Wav2Lip + GAN | Slightly inferior lip-sync, but better visual quality | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?e=n9ljGW) |
45
+ | Expert Discriminator | Weights of the expert discriminator | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EQRvmiZg-HRAjvI6zqN9eTEBP74KefynCwPWVmF57l-AYA?e=ZRPHKP) |
46
+ | Visual Quality Discriminator | Weights of the visual disc trained in a GAN setup | [Link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/radrabha_m_research_iiit_ac_in/EQVqH88dTm1HjlK11eNba5gBbn15WMS0B0EZbDBttqrqkg?e=ic0ljo) |
47
+
48
+ Lip-syncing videos using the pre-trained models (Inference)
49
+ -------
50
+ You can lip-sync any video to any audio:
51
+ ```bash
52
+ python inference.py --checkpoint_path <ckpt> --face <video.mp4> --audio <an-audio-source>
53
+ ```
54
+ The result is saved (by default) in `results/result_voice.mp4`. You can specify it as an argument, similar to several other available options. The audio source can be any file supported by `FFMPEG` containing audio data: `*.wav`, `*.mp3` or even a video file, from which the code will automatically extract the audio.
55
+
56
+ ##### Tips for better results:
57
+ - Experiment with the `--pads` argument to adjust the detected face bounding box. Often leads to improved results. You might need to increase the bottom padding to include the chin region. E.g. `--pads 0 20 0 0`.
58
+ - If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give another try.
59
+ - Experiment with the `--resize_factor` argument, to get a lower resolution video. Why? The models are trained on faces which were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
60
+ - The Wav2Lip model without GAN usually needs more experimenting with the above two to get the most ideal results, and sometimes, can give you a better result as well.
61
+
62
+ Preparing LRS2 for training
63
+ ----------
64
+ Our models are trained on LRS2. See [here](#training-on-datasets-other-than-lrs2) for a few suggestions regarding training on other datasets.
65
+ ##### LRS2 dataset folder structure
66
+
67
+ ```
68
+ data_root (mvlrs_v1)
69
+ ├── main, pretrain (we use only main folder in this work)
70
+ | ├── list of folders
71
+ | │ ├── five-digit numbered video IDs ending with (.mp4)
72
+ ```
73
+
74
+ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` folder.
75
+
76
+ ##### Preprocess the dataset for fast training
77
+
78
+ ```bash
79
+ python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/
80
+ ```
81
+ Additional options like `batch_size` and the number of GPUs to use in parallel can also be set.
82
+
83
+ ##### Preprocessed LRS2 folder structure
84
+ ```
85
+ preprocessed_root (lrs2_preprocessed)
86
+ ├── list of folders
87
+ | ├── Folders with five-digit numbered video IDs
88
+ | │ ├── *.jpg
89
+ | │ ├── audio.wav
90
+ ```
91
+
92
+ Train!
93
+ ----------
94
+ There are two major steps: (i) Train the expert lip-sync discriminator, (ii) Train the Wav2Lip model(s).
95
+
96
+ ##### Training the expert discriminator
97
+ You can download [the pre-trained weights](#getting-the-weights) if you want to skip this step. To train it:
98
+ ```bash
99
+ python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints>
100
+ ```
101
+ ##### Training the Wav2Lip models
102
+ You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
103
+ ```bash
104
+ python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_disc_checkpoint>
105
+ ```
106
+
107
+ To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both the files are similar. In both the cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file.
108
+
109
+ Training on datasets other than LRS2
110
+ ------------------------------------
111
+ Training on other datasets might require modifications to the code. Please read the following before you raise an issue:
112
+
113
+ - You might not get good results by training/fine-tuning on a few minutes of a single speaker. This is a separate research problem, to which we do not have a solution yet. Thus, we would most likely not be able to resolve your issue.
114
+ - You must train the expert discriminator for your own dataset before training Wav2Lip.
115
+ - If it is your own dataset downloaded from the web, in most cases it needs to be sync-corrected.
116
+ - Be mindful of the FPS of the videos of your dataset. Changes to FPS would need significant code changes.
117
+ - The expert discriminator's eval loss should go down to ~0.25 and the Wav2Lip eval sync loss should go down to ~0.2 to get good results.
118
+
119
+ When raising an issue on this topic, please let us know that you are aware of all these points.
120
+
121
+ We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model.
122
+
123
+ Evaluation
124
+ ----------
125
+ Please check the `evaluation/` folder for the instructions.
126
+
127
+ License and Citation
128
+ ----------
129
+ This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at [email protected] or [email protected]. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository:
130
+ ```
131
+ @inproceedings{10.1145/3394171.3413532,
132
+ author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.},
133
+ title = {A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild},
134
+ year = {2020},
135
+ isbn = {9781450379885},
136
+ publisher = {Association for Computing Machinery},
137
+ address = {New York, NY, USA},
138
+ url = {https://doi.org/10.1145/3394171.3413532},
139
+ doi = {10.1145/3394171.3413532},
140
+ booktitle = {Proceedings of the 28th ACM International Conference on Multimedia},
141
+ pages = {484–492},
142
+ numpages = {9},
143
+ keywords = {lip sync, talking face generation, video generation},
144
+ location = {Seattle, WA, USA},
145
+ series = {MM '20}
146
+ }
147
+ ```
148
+
149
+
150
+ Acknowledgements
151
+ ----------
152
+ Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook.
__pycache__/audio.cpython-39.pyc ADDED
Binary file (4.56 kB). View file
 
__pycache__/hparams.cpython-39.pyc ADDED
Binary file (2.4 kB). View file
 
aaa.jpg ADDED
audio.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import librosa.filters
3
+ import numpy as np
4
+ # import tensorflow as tf
5
+ from scipy import signal
6
+ from scipy.io import wavfile
7
+ from hparams import hparams as hp
8
+
9
def load_wav(path, sr):
    """Load the audio file at ``path`` via librosa with target rate ``sr`` and return the samples only."""
    samples, _ = librosa.core.load(path, sr=sr)
    return samples
11
+
12
def save_wav(wav, path, sr):
    """Peak-normalise ``wav`` to the int16 range and write it as a WAV file.

    The input array is left untouched (the previous implementation scaled it
    in place, silently changing the caller's data and failing for integer
    arrays).

    Args:
        wav: Array of audio samples.
        path: Destination file path.
        sr: Sample rate written to the file header.
    """
    # The 0.01 floor keeps near-silent input from being amplified without bound.
    peak = max(0.01, np.max(np.abs(wav)))
    scaled = np.asarray(wav) * (32767 / peak)
    # proposed by @dsmiller
    wavfile.write(path, sr, scaled.astype(np.int16))
16
+
17
def save_wavenet_wav(wav, path, sr):
    """Write ``wav`` to ``path`` as a WAV file without rescaling.

    ``librosa.output`` no longer exists in newer librosa releases, so it is
    used only when present; otherwise the samples are written directly with
    ``scipy.io.wavfile.write``.
    """
    legacy_output = getattr(librosa, "output", None)
    if legacy_output is not None:
        legacy_output.write_wav(path, wav, sr=sr)
    else:
        wavfile.write(path, sr, np.asarray(wav))
19
+
20
def preemphasis(wav, k, preemphasize=True):
    """Apply the FIR filter ``y[n] = x[n] - k * x[n-1]`` to ``wav``.

    When ``preemphasize`` is false the input object is returned unchanged.
    """
    if not preemphasize:
        return wav
    return signal.lfilter([1, -k], [1], wav)
24
+
25
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Apply the IIR filter ``y[n] = x[n] + k * y[n-1]`` to ``wav`` (inverse of ``preemphasis``).

    When ``inv_preemphasize`` is false the input object is returned unchanged.
    """
    if not inv_preemphasize:
        return wav
    return signal.lfilter([1], [1, -k], wav)
29
+
30
def get_hop_size():
    """Return the STFT hop size in samples.

    Uses ``hp.hop_size`` when set; otherwise derives it from
    ``hp.frame_shift_ms`` and ``hp.sample_rate``.
    """
    hop_size = hp.hop_size
    if hop_size is not None:
        return hop_size
    assert hp.frame_shift_ms is not None
    return int(hp.frame_shift_ms / 1000 * hp.sample_rate)
36
+
37
def linearspectrogram(wav):
    """Return the dB-scaled linear magnitude spectrogram of ``wav``.

    The signal is pre-emphasised, transformed with ``_stft``, converted to dB
    and offset by ``hp.ref_level_db``; it is normalised when
    ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    magnitudes = np.abs(_stft(emphasized))
    S = _amp_to_db(magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S
44
+
45
def melspectrogram(wav):
    """Return the dB-scaled mel spectrogram of ``wav``.

    The signal is pre-emphasised, transformed with ``_stft``, projected onto
    the mel basis, converted to dB and offset by ``hp.ref_level_db``; it is
    normalised when ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    mel_magnitudes = _linear_to_mel(np.abs(_stft(emphasized)))
    S = _amp_to_db(mel_magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S
52
+
53
def _lws_processor():
    """Build an ``lws`` processor configured from the hyper-parameters."""
    # Imported lazily so `lws` is only needed when this function is called.
    import lws
    hop = get_hop_size()
    return lws.lws(hp.n_fft, hop, fftsize=hp.win_size, mode="speech")
56
+
57
def _stft(y):
    """Return the short-time Fourier transform of ``y``.

    Uses the ``lws`` backend when ``hp.use_lws`` is set, otherwise
    ``librosa.stft`` with the hyper-parameter FFT, hop and window sizes.
    """
    if hp.use_lws:
        # _lws_processor() takes no arguments; passing `hp` raised TypeError.
        return _lws_processor().stft(y).T
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)
62
+
63
+ ##########################################################
64
+ #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
65
def num_frames(length, fsize, fshift):
    """Compute number of time frames of spectrogram
    """
    pad = fsize - fshift
    whole_shifts = (length + pad * 2 - fsize) // fshift
    # One extra frame is added when the length is not a multiple of the shift.
    extra = 1 if length % fshift == 0 else 2
    return whole_shifts + extra
74
+
75
+
76
def pad_lr(x, fsize, fshift):
    """Compute left and right padding
    """
    n = len(x)
    pad = fsize - fshift
    frames = num_frames(n, fsize, fshift)
    padded_len = n + 2 * pad
    # Extra right padding needed so the padded signal holds `frames` frames.
    extra_right = (frames - 1) * fshift + fsize - padded_len
    return pad, pad + extra_right
84
+ ##########################################################
85
+ #Librosa correct padding
86
def librosa_pad_lr(x, fsize, fshift):
    """Return ``(0, right)`` where ``right`` pads ``x`` up to the next multiple of ``fshift``.

    A full ``fshift`` is added when the length is already a multiple.
    ``fsize`` is unused.
    """
    n = x.shape[0]
    padded_len = (n // fshift + 1) * fshift
    return 0, padded_len - n
88
+
89
+ # Conversions
90
# Mel filter bank, built on first use by _linear_to_mel().
_mel_basis = None


def _linear_to_mel(spectogram):
    """Project a linear-frequency spectrogram onto the cached mel basis."""
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectogram)
97
+
98
def _build_mel_basis():
    """Build the mel filter bank from the hyper-parameters via ``librosa.filters.mel``."""
    nyquist = hp.sample_rate // 2
    assert hp.fmax <= nyquist
    return librosa.filters.mel(
        sr=hp.sample_rate,
        n_fft=hp.n_fft,
        n_mels=hp.num_mels,
        fmin=hp.fmin,
        fmax=hp.fmax,
    )
102
def _amp_to_db(x):
    """Convert amplitudes to decibels, flooring at the amplitude for ``hp.min_level_db``."""
    floor = np.exp(hp.min_level_db / 20 * np.log(10))
    clipped = np.maximum(floor, x)
    return 20 * np.log10(clipped)
105
+
106
+ def _db_to_amp(x):
107
+ return np.power(10.0, (x) * 0.05)
108
+
109
def _normalize(S):
    """Map a dB spectrogram from ``[hp.min_level_db, 0]`` to the configured range.

    The target range is ``[-max_abs_value, max_abs_value]`` when
    ``hp.symmetric_mels`` is set, else ``[0, max_abs_value]``. Out-of-range
    values are clipped when ``hp.allow_clipping_in_normalization`` is set and
    asserted against otherwise.
    """
    clipping = hp.allow_clipping_in_normalization
    if not clipping:
        assert S.max() <= 0 and S.min() - hp.min_level_db >= 0

    # Position of S within [min_level_db, 0], as a fraction.
    fraction = (S - hp.min_level_db) / (-hp.min_level_db)
    if hp.symmetric_mels:
        scaled = (2 * hp.max_abs_value) * fraction - hp.max_abs_value
        lower = -hp.max_abs_value
    else:
        scaled = hp.max_abs_value * fraction
        lower = 0

    if clipping:
        return np.clip(scaled, lower, hp.max_abs_value)
    return scaled
122
+
123
def _denormalize(D):
    """Invert ``_normalize``: map normalised values back to the dB range.

    Values are first clipped to the normalised range when
    ``hp.allow_clipping_in_normalization`` is set.
    """
    symmetric = hp.symmetric_mels
    if hp.allow_clipping_in_normalization:
        lower = -hp.max_abs_value if symmetric else 0
        D = np.clip(D, lower, hp.max_abs_value)

    if symmetric:
        return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
checkpoints/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ Place all your checkpoints (.pth files) here.
checkpoints/lipsync_expert.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b9936c721696446eeed353032cab242a8cf0eed8c46cde540366f6ae5493be5
3
+ size 197357631
checkpoints/visual_quality_disc.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f8f6f7e954af02f2ffe0f3ea11f3259af89bff6e70933001c7c6bc8c145d96
3
+ size 169382040
checkpoints/wav2lip.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c6630ca96681c289e43d2ed15e18e5bca968abdea8abb0bf6617c085a9fc52
3
+ size 166953
checkpoints/wav2lip_gan.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8
3
+ size 435801865
color_syncnet_train.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os.path import dirname, join, basename, isfile
2
+ from tqdm import tqdm
3
+
4
+ from models import SyncNet_color as SyncNet
5
+ import audio
6
+
7
+ import torch
8
+ from torch import nn
9
+ from torch import optim
10
+ import torch.backends.cudnn as cudnn
11
+ from torch.utils import data as data_utils
12
+ import numpy as np
13
+
14
+ from glob import glob
15
+
16
+ import os, random, cv2, argparse
17
+ from hparams import hparams, get_image_list
18
+
19
+ parser = argparse.ArgumentParser(description='Code to train the expert lip-sync discriminator')
20
+
21
+ parser.add_argument("--data_root", help="Root folder of the preprocessed LRS2 dataset", required=True)
22
+
23
+ parser.add_argument('--checkpoint_dir', help='Save checkpoints to this directory', required=True, type=str)
24
+ parser.add_argument('--checkpoint_path', help='Resumed from this checkpoint', default=None, type=str)
25
+
26
+ args = parser.parse_args()
27
+
28
+
29
+ global_step = 0
30
+ global_epoch = 0
31
+ use_cuda = torch.cuda.is_available()
32
+ print('use_cuda: {}'.format(use_cuda))
33
+
34
+ syncnet_T = 5
35
+ syncnet_mel_step_size = 16
36
+
37
class Dataset(object):
    """Random sampler of (frame window, mel window, label) triples.

    Each item pairs ``syncnet_T`` consecutive frames with a mel window.
    The label is 1 when the frame window starts at the frame the mel window
    was cut for, and 0 when it starts at a different frame of the same video.
    """

    def __init__(self, split):
        self.all_videos = get_image_list(args.data_root, split)

    def get_frame_id(self, frame):
        """Return the integer frame number encoded in a path such as ``.../12.jpg``."""
        stem = basename(frame).split('.')[0]
        return int(stem)

    def get_window(self, start_frame):
        """Return the paths of ``syncnet_T`` consecutive frames, or None if one is missing."""
        first_id = self.get_frame_id(start_frame)
        video_dir = dirname(start_frame)

        paths = []
        for offset in range(syncnet_T):
            path = join(video_dir, '{}.jpg'.format(first_id + offset))
            if not isfile(path):
                return None
            paths.append(path)
        return paths

    def crop_audio_window(self, spec, start_frame):
        """Slice ``syncnet_mel_step_size`` rows of ``spec`` starting at the given frame."""
        # num_frames = (T x hop_size * fps) / sample_rate
        frame_num = self.get_frame_id(start_frame)
        first = int(80. * (frame_num / float(hparams.fps)))
        return spec[first:first + syncnet_mel_step_size, :]

    def __len__(self):
        return len(self.all_videos)

    def _read_window(self, fnames):
        """Load and resize every frame in ``fnames``; return None if any frame fails."""
        frames = []
        for fname in fnames:
            img = cv2.imread(fname)
            if img is None:
                return None
            try:
                img = cv2.resize(img, (hparams.img_size, hparams.img_size))
            except Exception:
                return None
            frames.append(img)
        return frames

    def __getitem__(self, idx):
        """Return ``(x, mel, y)`` for a randomly drawn sample.

        The incoming ``idx`` is ignored: a video is drawn at random and the
        draw is repeated until a usable sample is found.
        """
        while True:
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]

            img_names = list(glob(join(vidname, '*.jpg')))
            if len(img_names) <= 3 * syncnet_T:
                continue

            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            in_sync = random.choice([True, False])
            y = (torch.ones(1) if in_sync else torch.zeros(1)).float()
            chosen = img_name if in_sync else wrong_img_name

            window_fnames = self.get_window(chosen)
            if window_fnames is None:
                continue

            window = self._read_window(window_fnames)
            if window is None:
                continue

            try:
                wav = audio.load_wav(join(vidname, "audio.wav"), hparams.sample_rate)
                orig_mel = audio.melspectrogram(wav).T
            except Exception:
                continue

            # The audio window is always cut at img_name, even when the frames
            # come from wrong_img_name; that mismatch is the negative sample.
            mel = self.crop_audio_window(orig_mel.copy(), img_name)
            if mel.shape[0] != syncnet_mel_step_size:
                continue

            # H x W x 3 * T
            stacked = np.concatenate(window, axis=2) / 255.
            stacked = stacked.transpose(2, 0, 1)
            # Keep only the rows from the vertical midpoint downwards.
            lower_half = stacked[:, stacked.shape[1] // 2:]

            x = torch.FloatTensor(lower_half)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)
            return x, mel, y
132
+
133
+ logloss = nn.BCELoss()
134
def cosine_loss(a, v, y):
    """Binary cross-entropy between the cosine similarity of ``a`` and ``v`` and the label ``y``."""
    similarity = nn.functional.cosine_similarity(a, v)
    return logloss(similarity.unsqueeze(1), y)
139
+
140
def train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
    """Train ``model`` until ``global_epoch`` reaches ``nepochs``.

    Updates the module-level ``global_step`` and ``global_epoch`` counters.
    A checkpoint is saved on the first step and every ``checkpoint_interval``
    steps; evaluation runs every ``hparams.syncnet_eval_interval`` steps.
    """
    global global_step, global_epoch

    while global_epoch < nepochs:
        running_loss = 0.
        prog_bar = tqdm(enumerate(train_data_loader))
        for step, (x, mel, y) in prog_bar:
            # eval_model() switches the model to eval mode, so training mode
            # is re-enabled on every step.
            model.train()
            optimizer.zero_grad()

            # Move the batch to the training device
            x = x.to(device)
            mel = mel.to(device)
            y = y.to(device)

            a, v = model(mel, x)
            loss = cosine_loss(a, v, y)
            loss.backward()
            optimizer.step()

            global_step += 1
            running_loss += loss.item()

            if global_step == 1 or global_step % checkpoint_interval == 0:
                save_checkpoint(
                    model, optimizer, global_step, checkpoint_dir, global_epoch)

            if global_step % hparams.syncnet_eval_interval == 0:
                with torch.no_grad():
                    eval_model(test_data_loader, global_step, device, model, checkpoint_dir)

            # Mean loss over the steps of the current epoch
            prog_bar.set_description('Loss: {}'.format(running_loss / (step + 1)))

        global_epoch += 1
180
+
181
def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
    """Print the mean sync loss over at most ``eval_steps`` batches of the test loader.

    ``global_step`` and ``checkpoint_dir`` are accepted but not used.
    The model is left in eval mode.
    """
    eval_steps = 1400
    print('Evaluating for {} steps'.format(eval_steps))

    model.eval()
    losses = []
    for step, (x, mel, y) in enumerate(test_data_loader):
        # Move the batch to the evaluation device
        x = x.to(device)
        mel = mel.to(device)
        y = y.to(device)

        a, v = model(mel, x)
        losses.append(cosine_loss(a, v, y).item())

        # Stop after exactly eval_steps batches, matching the message above.
        if step + 1 >= eval_steps:
            break

    if not losses:
        # An empty loader would otherwise divide by zero below.
        print('No evaluation batches were produced; skipping loss report')
        return

    averaged_loss = sum(losses) / len(losses)
    print(averaged_loss)

    return
207
+
208
def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
    """Write model (and optionally optimizer) state to ``checkpoint_dir``.

    The file is named after ``step``. The optimizer state is stored only
    when ``hparams.save_optimizer_state`` is truthy; otherwise None is stored.
    """
    # Name the file after the step passed in, so the filename always agrees
    # with the "global_step" value stored inside the checkpoint.
    checkpoint_path = join(
        checkpoint_dir, "checkpoint_step{:09d}.pth".format(step))
    optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
    torch.save({
        "state_dict": model.state_dict(),
        "optimizer": optimizer_state,
        "global_step": step,
        "global_epoch": epoch,
    }, checkpoint_path)
    print("Saved checkpoint:", checkpoint_path)
220
+
221
def _load(checkpoint_path):
    """Load a checkpoint, keeping tensors on CPU storage when CUDA is unavailable."""
    if not use_cuda:
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
228
+
229
def load_checkpoint(path, model, optimizer, reset_optimizer=False):
    """Restore model weights, optimizer state and the global counters from ``path``.

    The optimizer state is restored only when ``reset_optimizer`` is False
    and the checkpoint actually holds one. Returns ``model``.
    """
    global global_step, global_epoch

    print("Load checkpoint from: {}".format(path))
    checkpoint = _load(path)
    model.load_state_dict(checkpoint["state_dict"])

    # Short-circuit keeps the "optimizer" key untouched when a reset is requested.
    if not reset_optimizer and checkpoint["optimizer"] is not None:
        print("Load optimizer state from {}".format(path))
        optimizer.load_state_dict(checkpoint["optimizer"])

    global_step = checkpoint["global_step"]
    global_epoch = checkpoint["global_epoch"]

    return model
245
+
246
if __name__ == "__main__":
    checkpoint_dir = args.checkpoint_dir
    checkpoint_path = args.checkpoint_path

    # makedirs also creates missing parent directories, which os.mkdir cannot do.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Dataset and Dataloader setup
    train_dataset = Dataset('train')
    test_dataset = Dataset('val')

    train_data_loader = data_utils.DataLoader(
        train_dataset, batch_size=hparams.syncnet_batch_size, shuffle=True,
        num_workers=hparams.num_workers)

    test_data_loader = data_utils.DataLoader(
        test_dataset, batch_size=hparams.syncnet_batch_size,
        num_workers=8)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Model
    model = SyncNet().to(device)
    print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=hparams.syncnet_lr)

    # Resume weights, optimizer state and step/epoch counters when a checkpoint is given
    if checkpoint_path is not None:
        load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer=False)

    train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=checkpoint_dir,
          checkpoint_interval=hparams.syncnet_checkpoint_interval,
          nepochs=hparams.nepochs)
evaluation/README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Novel Evaluation Framework, new filelists, and using the LSE-D and LSE-C metric.
2
+
3
+ Our paper also proposes a novel evaluation framework (Section 4). To evaluate on LRS2, LRS3, and LRW, the filelists are present in the `test_filelists` folder. Please use `gen_videos_from_filelist.py` script to generate the videos. After that, you can calculate the LSE-D and LSE-C scores using the instructions below. Please see [this thread](https://github.com/Rudrabha/Wav2Lip/issues/22#issuecomment-712825380) on how to calculate the FID scores.
4
+
5
+ The videos of the ReSyncED benchmark for real-world evaluation will be released soon.
6
+
7
+ ### Steps to set-up the evaluation repository for LSE-D and LSE-C metric:
8
+ We use the pre-trained syncnet model available in this [repository](https://github.com/joonson/syncnet_python).
9
+
10
+ * Clone the SyncNet repository.
11
+ ```
12
+ git clone https://github.com/joonson/syncnet_python.git
13
+ ```
14
+ * Follow the procedure given in the above linked [repository](https://github.com/joonson/syncnet_python) to download the pretrained models and set up the dependencies.
15
+ * **Note: Please create a separate virtual environment for the evaluation scripts. The dependency versions used by Wav2Lip and by the publicly released SyncNet code are different and can cause version-mismatch issues. To avoid this, we suggest that users install the evaluation dependencies in their own virtual environment.**
16
+ ```
17
+ cd syncnet_python
18
+ pip install -r requirements.txt
19
+ sh download_model.sh
20
+ ```
21
+ * The above step should ensure that all the dependencies required by the repository are installed and the pre-trained models are downloaded.
22
+
23
+ ### Running the evaluation scripts:
24
+ * Copy our evaluation scripts given in this folder to the cloned repository.
25
+ ```
26
+ cd Wav2Lip/evaluation/scores_LSE/
27
+ cp *.py syncnet_python/
28
+ cp *.sh syncnet_python/
29
+ ```
30
+ **Note: We will release the test filelists for LRW, LRS2 and LRS3 shortly once we receive permission from the dataset creators. We will also release the Real World Dataset we have collected shortly.**
31
+
32
+ * Our evaluation technique does not require ground truth of any sort. Given lip-synced videos, we can calculate the scores directly from the generated videos alone. Please store the generated videos (from our test sets or your own generated videos) in the following folder structure.
33
+ ```
34
+ video data root (Folder containing all videos)
35
+ ├── All .mp4 files
36
+ ```
37
+ * Change the folder back to the cloned repository.
38
+ ```
39
+ cd syncnet_python
40
+ ```
41
+ * To run evaluation on the LRW, LRS2 and LRS3 test files, please run the following command:
42
+ ```
43
+ python calculate_scores_LRS.py --data_root /path/to/video/data/root --tmp_dir tmp_dir/
44
+ ```
45
+
46
+ * To run evaluation on the ReSynced dataset or your own generated videos, please run the following command:
47
+ ```
48
+ sh calculate_scores_real_videos.sh /path/to/video/data/root
49
+ ```
50
+ * The generated scores will be present in the all_scores.txt generated in the ```syncnet_python/``` folder
51
+
52
+ # Evaluation of image quality using FID metric.
53
+ We use the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository for calculating the FID metrics. We dump all the frames in both ground-truth and generated videos and calculate the FID score.
54
+
55
+
56
+ # Opening issues related to evaluation scripts
57
+ * Please open the issues with the "Evaluation" label if you face any issues in the evaluation scripts.
58
+
59
+ # Acknowledgements
60
+ Our evaluation pipeline is based on two existing repositories. LSE metrics are based on the [syncnet_python](https://github.com/joonson/syncnet_python) repository and the FID score is based on the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository. We thank the authors of both repositories for releasing their wonderful code.
61
+
62
+
63
+
evaluation/gen_videos_from_filelist.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse
4
+ import dlib, json, subprocess
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch
8
+
9
+ sys.path.append('../')
10
+ import audio
11
+ import face_detection
12
+ from models import Wav2Lip
13
+
14
+ parser = argparse.ArgumentParser(description='Code to generate results for test filelists')
15
+
16
+ parser.add_argument('--filelist', type=str,
17
+ help='Filepath of filelist file to read', required=True)
18
+ parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
19
+ required=True)
20
+ parser.add_argument('--data_root', type=str, required=True)
21
+ parser.add_argument('--checkpoint_path', type=str,
22
+ help='Name of saved checkpoint to load weights from', required=True)
23
+
24
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 0, 0, 0],
25
+ help='Padding (top, bottom, left, right)')
26
+ parser.add_argument('--face_det_batch_size', type=int,
27
+ help='Single GPU batch size for face detection', default=64)
28
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
29
+
30
+ # parser.add_argument('--resize_factor', default=1, type=int)
31
+
32
+ args = parser.parse_args()
33
+ args.img_size = 96
34
+
35
def get_smoothened_boxes(boxes, T):
    """Smooth ``boxes`` in place with a forward-looking mean over ``T`` rows.

    Row ``i`` becomes the mean of rows ``i .. i+T-1``. Near the end, where
    that window would run past the array, the last ``T`` rows are averaged
    instead. Returns the same (mutated) array.
    """
    count = len(boxes)
    for idx in range(count):
        window = boxes[idx:idx + T] if idx + T <= count else boxes[count - T:]
        boxes[idx] = np.mean(window, axis=0)
    return boxes
43
+
44
def face_detect(images):
    """Detect, pad and smooth one face box per image.

    Detection runs in batches of ``args.face_det_batch_size``. On a
    RuntimeError the batch size is halved, stored back on ``args`` and the
    whole detection is retried. Returns one ``[face_crop, (y1, y2, x1, x2),
    True]`` entry per image.

    Raises:
        RuntimeError: detection still fails at batch size 1.
        ValueError: the detector returned no face for some image.
    """
    batch_size = args.face_det_batch_size

    while True:
        predictions = []
        try:
            for start in range(0, len(images), batch_size):
                chunk = np.array(images[start:start + batch_size])
                predictions.extend(detector.get_detections_for_batch(chunk))
        except RuntimeError:
            if batch_size == 1:
                raise RuntimeError('Image too big to run face detection on GPU')
            batch_size //= 2
            args.face_det_batch_size = batch_size
            print('Recovering from OOM error; New batch size: {}'.format(batch_size))
        else:
            break

    pad_top, pad_bottom, pad_left, pad_right = args.pads
    padded = []
    for rect, image in zip(predictions, images):
        if rect is None:
            raise ValueError('Face not detected!')

        # Grow the box by the requested padding, clamped to the image bounds.
        padded.append([
            max(0, rect[0] - pad_left),
            max(0, rect[1] - pad_top),
            min(image.shape[1], rect[2] + pad_right),
            min(image.shape[0], rect[3] + pad_bottom),
        ])

    boxes = get_smoothened_boxes(np.array(padded), T=5)

    results = []
    for image, (x1, y1, x2, y2) in zip(images, boxes):
        results.append([image[y1: y2, x1:x2], (y1, y2, x1, x2), True])
    return results
78
+
79
def datagen(frames, face_det_results, mels):
    """Yield model-ready batches of at most ``args.wav2lip_batch_size`` samples.

    Each yielded tuple is ``(img_batch, mel_batch, frame_batch, coords_batch)``:
    the image batch is the masked faces concatenated with the unmasked faces
    on the last axis and scaled by 1/255, and the mel batch gains a trailing
    axis of size 1. Entries whose detection result is flagged invalid are
    skipped.

    Raises:
        ValueError: there are more mel chunks than frames.
    """
    def _finalize(faces, mel_chunks):
        # Shared by the full-batch and the trailing partial-batch paths.
        faces, mel_chunks = np.asarray(faces), np.asarray(mel_chunks)

        # Zero the rows from the vertical midpoint downwards in the masked copy.
        masked = faces.copy()
        masked[:, args.img_size//2:] = 0

        faces = np.concatenate((masked, faces), axis=3) / 255.
        mel_chunks = np.reshape(
            mel_chunks, [len(mel_chunks), mel_chunks.shape[1], mel_chunks.shape[2], 1])
        return faces, mel_chunks

    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    for i, m in enumerate(mels):
        if i >= len(frames): raise ValueError('Equal or less lengths only')

        frame_to_save = frames[i].copy()
        face, coords, valid_frame = face_det_results[i].copy()
        if not valid_frame:
            continue

        face = cv2.resize(face, (args.img_size, args.img_size))

        img_batch.append(face)
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            img_arr, mel_arr = _finalize(img_batch, mel_batch)
            yield img_arr, mel_arr, frame_batch, coords_batch
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    if len(img_batch) > 0:
        img_arr, mel_arr = _finalize(img_batch, mel_batch)
        yield img_arr, mel_arr, frame_batch, coords_batch
119
+
120
+ fps = 25
121
+ mel_step_size = 16
122
+ mel_idx_multiplier = 80./fps
123
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
124
+ print('Using {} for inference.'.format(device))
125
+
126
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
127
+ flip_input=False, device=device)
128
+
129
def _load(checkpoint_path):
    """Load a checkpoint, keeping tensors on CPU storage unless ``device`` is 'cuda'."""
    if device != 'cuda':
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
136
+
137
def load_model(path):
    """Build a Wav2Lip model, load the weights at ``path`` and return it in eval mode on ``device``."""
    model = Wav2Lip()
    print("Load checkpoint from: {}".format(path))
    state = _load(path)["state_dict"]
    # Drop every 'module.' occurrence from the parameter names before loading.
    cleaned = {key.replace('module.', ''): value for key, value in state.items()}
    model.load_state_dict(cleaned)

    return model.to(device).eval()
149
+
150
+ model = load_model(args.checkpoint_path)
151
+
152
def main():
    """Generate one lip-synced video per ``audio video`` line of ``args.filelist``.

    For each pair the audio track is extracted with ffmpeg, split into mel
    windows, and the model output is pasted into the detected face box of
    each video frame. Results are written to ``args.results_dir`` as
    ``<line index>.mp4``. A pair is skipped when audio extraction fails, the
    mel contains NaN, no mel window fits, the video has fewer frames than
    mel windows, or a face is not detected.
    """
    assert args.data_root is not None
    data_root = args.data_root

    temp_dir = '../temp'
    temp_audio = '../temp/temp.wav'
    temp_video = '../temp/result.avi'

    os.makedirs(args.results_dir, exist_ok=True)
    # ffmpeg and VideoWriter both write here and do not create the directory.
    os.makedirs(temp_dir, exist_ok=True)

    with open(args.filelist, 'r') as filelist:
        lines = filelist.readlines()

    for idx, line in enumerate(tqdm(lines)):
        if not line.strip():
            continue
        audio_src, video = line.strip().split()

        audio_src = os.path.join(data_root, audio_src) + '.mp4'
        video = os.path.join(data_root, video) + '.mp4'

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        status = subprocess.call(['ffmpeg', '-loglevel', 'panic', '-y', '-i', audio_src,
                                  '-strict', '-2', temp_audio])
        if status != 0:
            # Otherwise the temp.wav left by a previous pair would be reused silently.
            print('ffmpeg could not extract audio from {}; skipping'.format(audio_src))
            continue

        wav = audio.load_wav(temp_audio, 16000)
        mel = audio.melspectrogram(wav)
        if np.isnan(mel.reshape(-1)).sum() > 0:
            continue

        # One mel window of mel_step_size columns per video frame
        mel_chunks = []
        i = 0
        while 1:
            start_idx = int(i * mel_idx_multiplier)
            if start_idx + mel_step_size > len(mel[0]):
                break
            mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
            i += 1

        if not mel_chunks:
            # Without any batch the video writer below would never be created.
            continue

        video_stream = cv2.VideoCapture(video)

        full_frames = []
        while 1:
            still_reading, frame = video_stream.read()
            if not still_reading or len(full_frames) > len(mel_chunks):
                video_stream.release()
                break
            full_frames.append(frame)

        if len(full_frames) < len(mel_chunks):
            continue

        full_frames = full_frames[:len(mel_chunks)]

        try:
            face_det_results = face_detect(full_frames.copy())
        except ValueError:
            continue

        gen = datagen(full_frames.copy(), face_det_results, mel_chunks)

        for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
            if i == 0:
                frame_h, frame_w = full_frames[0].shape[:-1]
                out = cv2.VideoWriter(temp_video,
                                      cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

            img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
            mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

            with torch.no_grad():
                pred = model(mel_batch, img_batch)

            pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

            # Paste each prediction back into its face box and write the frame
            for pl, f, c in zip(pred, frames, coords):
                y1, y2, x1, x2 = c
                pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
                f[y1:y2, x1:x2] = pl
                out.write(f)

        out.release()

        vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))

        # Mux the extracted audio with the generated silent video
        subprocess.call(['ffmpeg', '-loglevel', 'panic', '-y', '-i', temp_audio,
                         '-i', temp_video, '-strict', '-2', '-q:v', '1', vid])
236
+
237
+ if __name__ == '__main__':
238
+ main()
evaluation/real_videos_inference.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse
4
+ import dlib, json, subprocess
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch
8
+
9
+ sys.path.append('../')
10
+ import audio
11
+ import face_detection
12
+ from models import Wav2Lip
13
+
14
parser = argparse.ArgumentParser(description='Code to generate results on ReSyncED evaluation set')

parser.add_argument('--mode', type=str,
                    help='random | dubbed | tts', required=True)

parser.add_argument('--filelist', type=str,
                    help='Filepath of filelist file to read', default=None)

parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
                    required=True)
parser.add_argument('--data_root', type=str, required=True)
parser.add_argument('--checkpoint_path', type=str,
                    help='Name of saved checkpoint to load weights from', required=True)
parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
                    help='Padding (top, bottom, left, right)')

parser.add_argument('--face_det_batch_size', type=int,
                    help='Single GPU batch size for face detection', default=16)

parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
# The three resolutions are compared with integers, so they are parsed as int;
# without a type, values given on the command line would arrive as strings.
parser.add_argument('--face_res', type=int,
                    help='Approximate resolution of the face at which to test', default=180)
parser.add_argument('--min_frame_res', type=int,
                    help='Do not downsample further below this frame resolution', default=480)
parser.add_argument('--max_frame_res', type=int,
                    help='Downsample to at least this frame resolution', default=720)
# parser.add_argument('--resize_factor', default=1, type=int)

args = parser.parse_args()
args.img_size = 96
41
+
42
def get_smoothened_boxes(boxes, T):
    """Smooth ``boxes`` in place with a forward-looking mean over ``T`` rows.

    Row ``i`` becomes the mean of rows ``i .. i+T-1``. Near the end, where
    that window would run past the array, the last ``T`` rows are averaged
    instead. Returns the same (mutated) array.
    """
    total = len(boxes)
    for row in range(total):
        if row + T <= total:
            window = boxes[row:row + T]
        else:
            window = boxes[total - T:]
        boxes[row] = np.mean(window, axis=0)
    return boxes
50
+
51
def rescale_frames(images):
    """Downsample all frames by an integer factor chosen from the first frame's face size.

    A factor in 2..15 is accepted while the downsampled frame keeps its
    shorter side at or above ``args.min_frame_res`` and the downsampled face
    size is closer to ``args.face_res`` than the original face size is. The
    largest accepted factor is used; when none is accepted the frames are
    returned unchanged.

    Raises:
        ValueError: no face is detected in the first frame.
    """
    rect = detector.get_detections_for_batch(np.array([images[0]]))[0]
    if rect is None:
        raise ValueError('Face not detected!')
    h, w = images[0].shape[:-1]

    x1, y1, x2, y2 = rect

    # Coerce once so the arithmetic below also works if the options arrive as strings.
    face_res = int(args.face_res)
    min_frame_res = int(args.min_frame_res)

    face_size = max(np.abs(y1 - y2), np.abs(x1 - x2))

    # NOTE(review): the reference distance is the un-scaled one and is never
    # tightened while searching; kept as in the original, confirm intent.
    diff = np.abs(face_size - face_res)

    chosen = 1
    for factor in range(2, 16):
        if min(h // factor, w // factor) < min_frame_res:
            break
        if np.abs(face_size // factor - face_res) >= diff:
            break
        # Remember the last factor that passed both checks, so the final
        # candidate is kept even when the loop ends without a break.
        chosen = factor

    if chosen == 1:
        return images

    return [cv2.resize(im, (im.shape[1] // chosen, im.shape[0] // chosen)) for im in images]
71
+
72
+
73
def face_detect(images):
    """Rescale the frames, then detect, pad and smooth one face box per frame.

    Detection runs in batches of ``args.face_det_batch_size``. On a
    RuntimeError the batch size is halved and the whole detection is retried;
    the reduced size is stored back on ``args`` so later calls start from it.

    Returns:
        ``(results, images)``: ``results`` holds one ``[face_crop,
        (y1, y2, x1, x2), True]`` entry per frame and ``images`` are the
        rescaled frames the coordinates refer to.

    Raises:
        RuntimeError: detection still fails at batch size 1.
        ValueError: the detector returned no face for some frame.
    """
    batch_size = args.face_det_batch_size
    images = rescale_frames(images)

    while 1:
        predictions = []
        try:
            for i in range(0, len(images), batch_size):
                predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
        except RuntimeError:
            if batch_size == 1:
                raise RuntimeError('Image too big to run face detection on GPU')
            batch_size //= 2
            # Persist the working size so the next video does not repeat the failure.
            args.face_det_batch_size = batch_size
            print('Recovering from OOM error; New batch size: {}'.format(batch_size))
            continue
        break

    results = []
    pady1, pady2, padx1, padx2 = args.pads
    for rect, image in zip(predictions, images):
        if rect is None:
            raise ValueError('Face not detected!')

        # Grow the box by the requested padding, clamped to the image bounds.
        y1 = max(0, rect[1] - pady1)
        y2 = min(image.shape[0], rect[3] + pady2)
        x1 = max(0, rect[0] - padx1)
        x2 = min(image.shape[1], rect[2] + padx2)

        results.append([x1, y1, x2, y2])

    boxes = get_smoothened_boxes(np.array(results), T=5)
    results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]

    return results, images
107
+
108
def datagen(frames, face_det_results, mels):
    """Yield model-ready batches of at most ``args.wav2lip_batch_size`` samples.

    Each yielded tuple is ``(img_batch, mel_batch, frame_batch, coords_batch)``:
    the image batch is the masked faces concatenated with the unmasked faces
    on the last axis and scaled by 1/255, and the mel batch gains a trailing
    axis of size 1. Entries whose detection result is flagged invalid are
    skipped.

    Raises:
        ValueError: there are more mel chunks than frames.
    """
    def _finalize(faces, mel_chunks):
        # Shared by the full-batch and the trailing partial-batch paths.
        faces, mel_chunks = np.asarray(faces), np.asarray(mel_chunks)

        # Zero the rows from the vertical midpoint downwards in the masked copy.
        masked = faces.copy()
        masked[:, args.img_size//2:] = 0

        faces = np.concatenate((masked, faces), axis=3) / 255.
        mel_chunks = np.reshape(
            mel_chunks, [len(mel_chunks), mel_chunks.shape[1], mel_chunks.shape[2], 1])
        return faces, mel_chunks

    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    for i, m in enumerate(mels):
        if i >= len(frames): raise ValueError('Equal or less lengths only')

        frame_to_save = frames[i].copy()
        face, coords, valid_frame = face_det_results[i].copy()
        if not valid_frame:
            continue

        face = cv2.resize(face, (args.img_size, args.img_size))

        img_batch.append(face)
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            img_arr, mel_arr = _finalize(img_batch, mel_batch)
            yield img_arr, mel_arr, frame_batch, coords_batch
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    if len(img_batch) > 0:
        img_arr, mel_arr = _finalize(img_batch, mel_batch)
        yield img_arr, mel_arr, frame_batch, coords_batch
148
+
149
def increase_frames(frames, l):
    """Return a list of exactly ``l`` frames, duplicating frames evenly when needed.

    When ``frames`` already holds at least ``l`` entries it is truncated to
    the first ``l``. Otherwise output position ``i`` takes input frame
    ``i * len(frames) // l``, which spreads the duplicates across the whole
    clip and keeps the frame order. Duplicated positions refer to the same
    frame object.

    Raises:
        ValueError: ``frames`` is empty but ``l`` is positive.
    """
    count = len(frames)
    if count >= l:
        return frames[:l]
    if count == 0:
        raise ValueError('Cannot increase the length of an empty frame list')

    # Single pass: integer proportional mapping from output index to input index.
    return [frames[(i * count) // l] for i in range(l)]
168
+
169
+ mel_step_size = 16
170
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
171
+ print('Using {} for inference.'.format(device))
172
+
173
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
174
+ flip_input=False, device=device)
175
+
176
def _load(checkpoint_path):
    """Load a checkpoint, keeping tensors on CPU storage unless ``device`` is 'cuda'."""
    on_gpu = device == 'cuda'
    if on_gpu:
        return torch.load(checkpoint_path)
    return torch.load(checkpoint_path,
                      map_location=lambda storage, loc: storage)
183
+
184
def load_model(path):
    """Build a Wav2Lip model, load the weights at ``path`` and return it in eval mode on ``device``."""
    model = Wav2Lip()
    print("Load checkpoint from: {}".format(path))
    weights = _load(path)["state_dict"]
    # Drop every 'module.' occurrence from the parameter names before loading.
    renamed = dict((name.replace('module.', ''), tensor) for name, tensor in weights.items())
    model.load_state_dict(renamed)

    model = model.to(device)
    model.eval()
    return model
196
+
197
+ model = load_model(args.checkpoint_path)
198
+
199
def main():
    """Generate one lip-synced video per (video, audio) pair.

    In ``dubbed`` mode every file in ``args.data_root`` is paired with
    itself; in any other mode the pairs are read from ``args.filelist`` as
    ``video audio`` lines. Results are written to ``args.results_dir`` as
    ``<pair index>.mp4``. A pair is skipped when no mel window fits or a
    face is not detected.

    Raises:
        RuntimeError: ffmpeg fails to extract the audio track.
        ValueError: the mel contains NaN, or the video has fewer frames than
            mel windows outside ``tts`` mode.
    """
    temp_dir = '../temp'
    temp_audio = '../temp/temp.wav'
    temp_video = '../temp/result.avi'

    os.makedirs(args.results_dir, exist_ok=True)
    # ffmpeg and VideoWriter both write here and do not create the directory.
    os.makedirs(temp_dir, exist_ok=True)

    if args.mode == 'dubbed':
        # Pair names directly so file names containing spaces stay intact.
        pairs = [(f, f) for f in listdir(args.data_root)]
    else:
        assert args.filelist is not None
        with open(args.filelist, 'r') as filelist:
            pairs = [tuple(line.split()) for line in filelist]

    # Coerce once so the comparison below also works if the option arrives as a string.
    max_frame_res = int(args.max_frame_res)

    for idx, pair in enumerate(tqdm(pairs)):
        if not pair:
            # Blank filelist line
            continue
        video, audio_src = pair

        audio_src = os.path.join(args.data_root, audio_src)
        video = os.path.join(args.data_root, video)

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        status = subprocess.call(['ffmpeg', '-loglevel', 'panic', '-y', '-i', audio_src,
                                  '-strict', '-2', temp_audio])
        if status != 0:
            # Otherwise the temp.wav left by a previous pair would be reused silently.
            raise RuntimeError('ffmpeg could not extract audio from {}'.format(audio_src))

        wav = audio.load_wav(temp_audio, 16000)
        mel = audio.melspectrogram(wav)

        if np.isnan(mel.reshape(-1)).sum() > 0:
            raise ValueError('Mel contains nan!')

        video_stream = cv2.VideoCapture(video)

        fps = video_stream.get(cv2.CAP_PROP_FPS)
        mel_idx_multiplier = 80./fps

        full_frames = []
        while 1:
            still_reading, frame = video_stream.read()
            if not still_reading:
                video_stream.release()
                break

            # Shrink frames whose shorter side exceeds max_frame_res
            if min(frame.shape[:-1]) > max_frame_res:
                h, w = frame.shape[:-1]
                scale_factor = min(h, w) / float(max_frame_res)
                h = int(h/scale_factor)
                w = int(w/scale_factor)

                frame = cv2.resize(frame, (w, h))
            full_frames.append(frame)

        # One mel window of mel_step_size columns per video frame
        mel_chunks = []
        i = 0
        while 1:
            start_idx = int(i * mel_idx_multiplier)
            if start_idx + mel_step_size > len(mel[0]):
                break
            mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
            i += 1

        if not mel_chunks:
            # Without any batch the video writer below would never be created.
            continue

        if len(full_frames) < len(mel_chunks):
            if args.mode == 'tts':
                full_frames = increase_frames(full_frames, len(mel_chunks))
            else:
                raise ValueError('#Frames, audio length mismatch')

        else:
            full_frames = full_frames[:len(mel_chunks)]

        try:
            face_det_results, full_frames = face_detect(full_frames.copy())
        except ValueError:
            continue

        gen = datagen(full_frames.copy(), face_det_results, mel_chunks)

        for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
            if i == 0:
                frame_h, frame_w = full_frames[0].shape[:-1]

                out = cv2.VideoWriter(temp_video,
                                      cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

            img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
            mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

            with torch.no_grad():
                pred = model(mel_batch, img_batch)

            pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

            # Paste each prediction back into its face box and write the frame
            for pl, f, c in zip(pred, frames, coords):
                y1, y2, x1, x2 = c
                pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
                f[y1:y2, x1:x2] = pl
                out.write(f)

        out.release()

        vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
        # Mux the extracted audio with the generated silent video
        subprocess.call(['ffmpeg', '-loglevel', 'panic', '-y', '-i', temp_audio,
                         '-i', temp_video, '-strict', '-2', '-q:v', '1', vid])
302
+
303
+
304
+ if __name__ == '__main__':
305
+ main()
evaluation/scores_LSE/SyncNetInstance_calc_scores.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+ # Video 25 FPS, Audio 16000HZ
4
+
5
+ import torch
6
+ import numpy
7
+ import time, pdb, argparse, subprocess, os, math, glob
8
+ import cv2
9
+ import python_speech_features
10
+
11
+ from scipy import signal
12
+ from scipy.io import wavfile
13
+ from SyncNetModel import *
14
+ from shutil import rmtree
15
+
16
+
17
+ # ==================== Get OFFSET ====================
18
+
19
def calc_pdist(feat1, feat2, vshift=10):
    """Return per-row distances between ``feat1`` and shifted rows of ``feat2``.

    ``feat2`` is zero-padded with ``vshift`` rows before and after, so that
    row ``i`` of ``feat1`` can be compared against the ``2 * vshift + 1``
    rows of ``feat2`` centred on row ``i``.

    Args:
        feat1: 2-D tensor, one feature vector per row.
        feat2: 2-D tensor with the same feature width as ``feat1``
            (assumed to have at least as many rows as ``feat1`` -- confirm
            against the caller).
        vshift: maximum shift, in rows, considered in each direction.

    Returns:
        A list with one 1-D tensor per row of ``feat1``; entry ``k`` of each
        tensor is the pairwise distance at shift index ``k`` (index
        ``vshift`` corresponds to no shift).
    """
    window = 2 * vshift + 1
    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))
    pairwise = torch.nn.functional.pairwise_distance

    return [
        pairwise(feat1[[row], :].repeat(window, 1), padded[row:row + window, :])
        for row in range(len(feat1))
    ]
32
+
33
+ # ==================== MAIN DEF ====================
34
+
35
class SyncNetInstance(torch.nn.Module):
    """Inference wrapper around the SyncNet model ``S``.

    ``S`` is expected to come from the module-level
    ``from SyncNetModel import *``.  The wrapped network is moved to CUDA on
    construction, and every forward pass in this class sends its inputs to
    CUDA as well.
    """

    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
        """Build the underlying network on the GPU.

        Args:
            dropout: accepted for interface compatibility; not used here.
            num_layers_in_fc_layers: forwarded to ``S``.
        """
        super(SyncNetInstance, self).__init__()

        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda()

    @staticmethod
    def _frames_to_tensor(images, source):
        """Stack HxWxC frames into a float32 tensor of shape (1, C, N, H, W)."""
        if not images:
            # numpy.stack would raise an opaque "need at least one array"
            # ValueError here; keep the type, improve the message.
            raise ValueError('No video frames could be read from %s' % source)

        im = numpy.stack(images, axis=3)
        im = numpy.expand_dims(im, axis=0)
        im = numpy.transpose(im, (0, 3, 4, 1, 2))

        # Convert straight to float32 instead of going through float64.
        return torch.from_numpy(im.astype(numpy.float32))

    def evaluate(self, opt, videofile):
        """Estimate the audio-visual offset of ``videofile``.

        The video is split with ffmpeg into JPEG frames and a 16 kHz mono
        WAV file under ``opt.tmp_dir/opt.reference`` (that directory is
        wiped first).  Lip and audio features are computed in batches and
        compared over shifts of up to ``opt.vshift`` frames.

        Args:
            opt: namespace providing ``tmp_dir``, ``reference``,
                ``batch_size`` and ``vshift``.
            videofile: path of the video to score.

        Returns:
            Tuple ``(offset, conf, minval)`` of numpy values: the shift with
            the smallest mean distance (expressed as ``vshift - argmin``),
            the median mean-distance minus the minimum, and the minimum
            mean distance.

        Raises:
            ValueError: if no frames were extracted or the clip is too
                short to form a single 5-frame window.
        """
        self.__S__.eval()

        # ========== ==========
        # Convert files
        # ========== ==========

        work_dir = os.path.join(opt.tmp_dir, opt.reference)

        if os.path.exists(work_dir):
            rmtree(work_dir)

        os.makedirs(work_dir)

        # Argument lists (no shell) keep paths containing spaces or shell
        # metacharacters intact.  As before, ffmpeg's exit status is not
        # checked; a failed extraction surfaces below as missing frames or
        # a missing audio file.
        subprocess.call(['ffmpeg', '-loglevel', 'error', '-y', '-i', videofile,
                         '-threads', '1', '-f', 'image2',
                         os.path.join(work_dir, '%06d.jpg')])

        audio_path = os.path.join(work_dir, 'audio.wav')
        subprocess.call(['ffmpeg', '-loglevel', 'error', '-y', '-i', videofile,
                         '-async', '1', '-ac', '1', '-vn',
                         '-acodec', 'pcm_s16le', '-ar', '16000', audio_path])

        # ========== ==========
        # Load video
        # ========== ==========

        images = []
        for fname in sorted(glob.glob(os.path.join(work_dir, '*.jpg'))):
            img_input = cv2.imread(fname)
            img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
            images.append(img_input)

        imtv = self._frames_to_tensor(images, videofile)

        # ========== ==========
        # Load audio
        # ========== ==========

        sample_rate, audio = wavfile.read(audio_path)
        # mfcc() yields one row per audio frame; the model input wants the
        # coefficients on the first axis, hence the transpose.
        mfcc = numpy.transpose(numpy.asarray(python_speech_features.mfcc(audio, sample_rate)))

        cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0)
        cct = torch.from_numpy(cc.astype(numpy.float32))

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        # 640 audio samples per video frame (16000 Hz audio, 25 fps video).
        min_length = min(len(images), math.floor(len(audio)/640))

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = min_length-5
        if lastframe <= 0:
            raise ValueError('%s is too short to evaluate (usable length: %d frames)'
                             % (videofile, min_length))

        im_feat = []
        cc_feat = []

        # Pure inference: skip autograd bookkeeping.
        with torch.no_grad():
            for i in range(0, lastframe, opt.batch_size):
                frame_ids = range(i, min(lastframe, i+opt.batch_size))

                # 5-frame video windows ...
                im_in = torch.cat([imtv[:,:,vframe:vframe+5,:,:] for vframe in frame_ids], 0)
                im_feat.append(self.__S__.forward_lip(im_in.cuda()).cpu())

                # ... paired with 20-step MFCC windows, 4 MFCC steps per video frame.
                cc_in = torch.cat([cct[:,:,:,vframe*4:vframe*4+20] for vframe in frame_ids], 0)
                cc_feat.append(self.__S__.forward_aud(cc_in.cuda()).cpu())

        im_feat = torch.cat(im_feat, 0)
        cc_feat = torch.cat(cc_feat, 0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        dists = calc_pdist(im_feat, cc_feat, vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists, 1), 1)

        minval, minidx = torch.min(mdist, 0)

        offset = opt.vshift-minidx
        conf = torch.median(mdist) - minval

        # Kept from the original: this changes numpy's global print format,
        # which code printing arrays after this call may rely on.
        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})

        return offset.numpy(), conf.numpy(), minval.numpy()

    def extract_feature(self, opt, videofile):
        """Return lip features for every 5-frame window of ``videofile``.

        Frames are read with OpenCV at their native size (no resizing).

        Args:
            opt: namespace providing ``batch_size``.
            videofile: path of the video to read.

        Returns:
            Tensor with one feature row per 5-frame window, on the CPU.

        Raises:
            ValueError: if no frame could be read or the clip has fewer
                than 5 frames.
        """
        self.__S__.eval()

        # ========== ==========
        # Load video
        # ========== ==========
        cap = cv2.VideoCapture(videofile)

        images = []
        try:
            while True:
                ret, image = cap.read()
                if not ret:
                    break
                images.append(image)
        finally:
            # The capture handle was previously never released.
            cap.release()

        imtv = self._frames_to_tensor(images, videofile)

        # ========== ==========
        # Generate video feats
        # ========== ==========

        lastframe = len(images)-4
        if lastframe <= 0:
            raise ValueError('%s is too short to extract features (%d frames)'
                             % (videofile, len(images)))

        im_feat = []

        tS = time.time()
        with torch.no_grad():
            for i in range(0, lastframe, opt.batch_size):
                frame_ids = range(i, min(lastframe, i+opt.batch_size))
                im_in = torch.cat([imtv[:,:,vframe:vframe+5,:,:] for vframe in frame_ids], 0)
                im_feat.append(self.__S__.forward_lipfeat(im_in.cuda()).cpu())

        im_feat = torch.cat(im_feat, 0)

        print('Compute time %.3f sec.' % (time.time()-tS))

        return im_feat

    def loadParameters(self, path):
        """Copy the weights stored at ``path`` into the wrapped network.

        Tensors are loaded onto the CPU first and then copied in place into
        the matching entries of the network's ``state_dict``.  A name in the
        checkpoint that the network does not have raises ``KeyError``.

        Args:
            path: checkpoint location, passed to ``torch.load``.
        """
        loaded_state = torch.load(path, map_location=lambda storage, loc: storage)

        self_state = self.__S__.state_dict()

        for name, param in loaded_state.items():
            self_state[name].copy_(param)
evaluation/scores_LSE/calculate_scores_LRS.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess
5
+ import glob
6
+ import os
7
+ from tqdm import tqdm
8
+
9
+ from SyncNetInstance_calc_scores import *
10
+
11
+ # ==================== LOAD PARAMS ====================
12
+
13
+
14
# Command-line options.  --data_root is the directory searched for *.mp4
# files; --tmp_dir and --reference name the scratch directory handed to
# SyncNetInstance.evaluate.
parser = argparse.ArgumentParser(description = "SyncNet")

parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--data_root', type=str, required=True, help='')
parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='')
parser.add_argument('--reference', type=str, default="demo", help='')

opt = parser.parse_args()


# ==================== RUN EVALUATION ====================

path = os.path.join(opt.data_root, "*.mp4")
all_videos = glob.glob(path)

# Without this guard an empty directory ended in a ZeroDivisionError when
# the averages were printed; fail early with a readable message instead.
if not all_videos:
    raise SystemExit('No .mp4 files found in {}'.format(opt.data_root))

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
#print("Model %s loaded."%opt.initial_model);

prog_bar = tqdm(all_videos)
avg_confidence = 0.
avg_min_distance = 0.


# Running sums; the progress-bar caption shows the running averages.
for num_done, videofile in enumerate(prog_bar, start=1):
    offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
    avg_confidence += confidence
    avg_min_distance += min_distance
    prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / num_done, 3), round(avg_min_distance / num_done, 3)))
    prog_bar.refresh()

print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
51
+
52
+
53
+
evaluation/scores_LSE/calculate_scores_real_videos.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess, pickle, os, gzip, glob
5
+
6
+ from SyncNetInstance_calc_scores import *
7
+
8
+ # ==================== PARSE ARGUMENT ====================
9
+
10
# Command-line options.  NOTE(review): --videofile is parsed but not read
# anywhere in this script; the clips to score are found under
# <data_dir>/pycrop/<reference>/ instead.
parser = argparse.ArgumentParser(description = "SyncNet")
parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--data_dir', type=str, default='data/work', help='')
parser.add_argument('--videofile', type=str, default='', help='')
parser.add_argument('--reference', type=str, default='', help='')
opt = parser.parse_args()

# Derived working directories, all rooted at --data_dir.
opt.avi_dir = os.path.join(opt.data_dir, 'pyavi')
opt.tmp_dir = os.path.join(opt.data_dir, 'pytmp')
opt.work_dir = os.path.join(opt.data_dir, 'pywork')
opt.crop_dir = os.path.join(opt.data_dir, 'pycrop')


# ==================== LOAD MODEL AND FILE LIST ====================

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
#print("Model %s loaded."%opt.initial_model);

flist = sorted(glob.glob(os.path.join(opt.crop_dir, opt.reference, '0*.avi')))

# ==================== GET OFFSETS ====================

# One "<min distance> <confidence>" line is printed per clip.  The
# distances are also collected in `dists`, which the original created but
# never filled.
dists = []
for fname in flist:
    offset, conf, dist = s.evaluate(opt, videofile=fname)
    dists.append(dist)
    print (str(dist)+" "+str(conf))
41
+
42
+ # ==================== PRINT RESULTS TO FILE ====================
43
+
44
+ #with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
45
+ # pickle.dump(dists, fil)
evaluation/scores_LSE/calculate_scores_real_videos.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Score every video found in the directory given as $1.
# For each file, run_pipeline.py is run first, then
# calculate_scores_real_videos.py, whose output is appended to
# all_scores.txt.

# -f: do not complain when there is no previous result file.
rm -f all_scores.txt

# Glob instead of parsing `ls`, and quote every expansion, so file names
# containing spaces survive.
for eachfile in "$1"/*
do
    # An empty directory leaves the pattern unexpanded; skip that case.
    [ -e "$eachfile" ] || continue
    python run_pipeline.py --videofile "$eachfile" --reference wav2lip --data_dir tmp_dir
    python calculate_scores_real_videos.py --videofile "$eachfile" --reference wav2lip --data_dir tmp_dir >> all_scores.txt
done
evaluation/test_filelists/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This folder contains the filelists for the new evaluation framework proposed in the paper.
2
+
3
+ ## Test filelists for LRS2, LRS3, and LRW.
4
+
5
+ This folder contains three filelists, each containing a list of names of audio-video pairs from the test sets of LRS2, LRS3, and LRW. The LRS2 and LRW filelists are strictly "Copyright BBC" and can only be used for “non-commercial research by applicants who have an agreement with the BBC to access the Lip Reading in the Wild and/or Lip Reading Sentences in the Wild datasets”. Please follow this link for more details: [https://www.bbc.co.uk/rd/projects/lip-reading-datasets](https://www.bbc.co.uk/rd/projects/lip-reading-datasets).
6
+
7
+
8
+ ## ReSyncED benchmark
9
+
10
+ The sub-folder `ReSyncED` contains filelists for our own Real-world lip-Sync Evaluation Dataset (ReSyncED).
11
+
12
+
13
+ #### Instructions on how to use the above two sets of filelists are available in the README of the parent folder.
evaluation/test_filelists/ReSyncED/random_pairs.txt ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sachin.mp4 emma_cropped.mp4
2
+ sachin.mp4 mourinho.mp4
3
+ sachin.mp4 elon.mp4
4
+ sachin.mp4 messi2.mp4
5
+ sachin.mp4 cr1.mp4
6
+ sachin.mp4 sachin.mp4
7
+ sachin.mp4 sg.mp4
8
+ sachin.mp4 fergi.mp4
9
+ sachin.mp4 spanish_lec1.mp4
10
+ sachin.mp4 bush_small.mp4
11
+ sachin.mp4 macca_cut.mp4
12
+ sachin.mp4 ca_cropped.mp4
13
+ sachin.mp4 lecun.mp4
14
+ sachin.mp4 spanish_lec0.mp4
15
+ srk.mp4 emma_cropped.mp4
16
+ srk.mp4 mourinho.mp4
17
+ srk.mp4 elon.mp4
18
+ srk.mp4 messi2.mp4
19
+ srk.mp4 cr1.mp4
20
+ srk.mp4 srk.mp4
21
+ srk.mp4 sachin.mp4
22
+ srk.mp4 sg.mp4
23
+ srk.mp4 fergi.mp4
24
+ srk.mp4 spanish_lec1.mp4
25
+ srk.mp4 bush_small.mp4
26
+ srk.mp4 macca_cut.mp4
27
+ srk.mp4 ca_cropped.mp4
28
+ srk.mp4 guardiola.mp4
29
+ srk.mp4 lecun.mp4
30
+ srk.mp4 spanish_lec0.mp4
31
+ cr1.mp4 emma_cropped.mp4
32
+ cr1.mp4 elon.mp4
33
+ cr1.mp4 messi2.mp4
34
+ cr1.mp4 cr1.mp4
35
+ cr1.mp4 spanish_lec1.mp4
36
+ cr1.mp4 bush_small.mp4
37
+ cr1.mp4 macca_cut.mp4
38
+ cr1.mp4 ca_cropped.mp4
39
+ cr1.mp4 lecun.mp4
40
+ cr1.mp4 spanish_lec0.mp4
41
+ macca_cut.mp4 emma_cropped.mp4
42
+ macca_cut.mp4 elon.mp4
43
+ macca_cut.mp4 messi2.mp4
44
+ macca_cut.mp4 spanish_lec1.mp4
45
+ macca_cut.mp4 macca_cut.mp4
46
+ macca_cut.mp4 ca_cropped.mp4
47
+ macca_cut.mp4 spanish_lec0.mp4
48
+ lecun.mp4 emma_cropped.mp4
49
+ lecun.mp4 elon.mp4
50
+ lecun.mp4 messi2.mp4
51
+ lecun.mp4 spanish_lec1.mp4
52
+ lecun.mp4 macca_cut.mp4
53
+ lecun.mp4 ca_cropped.mp4
54
+ lecun.mp4 lecun.mp4
55
+ lecun.mp4 spanish_lec0.mp4
56
+ messi2.mp4 emma_cropped.mp4
57
+ messi2.mp4 elon.mp4
58
+ messi2.mp4 messi2.mp4
59
+ messi2.mp4 spanish_lec1.mp4
60
+ messi2.mp4 macca_cut.mp4
61
+ messi2.mp4 ca_cropped.mp4
62
+ messi2.mp4 spanish_lec0.mp4
63
+ ca_cropped.mp4 emma_cropped.mp4
64
+ ca_cropped.mp4 elon.mp4
65
+ ca_cropped.mp4 spanish_lec1.mp4
66
+ ca_cropped.mp4 ca_cropped.mp4
67
+ ca_cropped.mp4 spanish_lec0.mp4
68
+ spanish_lec1.mp4 spanish_lec1.mp4
69
+ spanish_lec1.mp4 spanish_lec0.mp4
70
+ elon.mp4 elon.mp4
71
+ elon.mp4 spanish_lec1.mp4
72
+ elon.mp4 spanish_lec0.mp4
73
+ guardiola.mp4 emma_cropped.mp4
74
+ guardiola.mp4 mourinho.mp4
75
+ guardiola.mp4 elon.mp4
76
+ guardiola.mp4 messi2.mp4
77
+ guardiola.mp4 cr1.mp4
78
+ guardiola.mp4 sachin.mp4
79
+ guardiola.mp4 sg.mp4
80
+ guardiola.mp4 fergi.mp4
81
+ guardiola.mp4 spanish_lec1.mp4
82
+ guardiola.mp4 bush_small.mp4
83
+ guardiola.mp4 macca_cut.mp4
84
+ guardiola.mp4 ca_cropped.mp4
85
+ guardiola.mp4 guardiola.mp4
86
+ guardiola.mp4 lecun.mp4
87
+ guardiola.mp4 spanish_lec0.mp4
88
+ fergi.mp4 emma_cropped.mp4
89
+ fergi.mp4 mourinho.mp4
90
+ fergi.mp4 elon.mp4
91
+ fergi.mp4 messi2.mp4
92
+ fergi.mp4 cr1.mp4
93
+ fergi.mp4 sachin.mp4
94
+ fergi.mp4 sg.mp4
95
+ fergi.mp4 fergi.mp4
96
+ fergi.mp4 spanish_lec1.mp4
97
+ fergi.mp4 bush_small.mp4
98
+ fergi.mp4 macca_cut.mp4
99
+ fergi.mp4 ca_cropped.mp4
100
+ fergi.mp4 lecun.mp4
101
+ fergi.mp4 spanish_lec0.mp4
102
+ spanish.mp4 emma_cropped.mp4
103
+ spanish.mp4 spanish.mp4
104
+ spanish.mp4 mourinho.mp4
105
+ spanish.mp4 elon.mp4
106
+ spanish.mp4 messi2.mp4
107
+ spanish.mp4 cr1.mp4
108
+ spanish.mp4 srk.mp4
109
+ spanish.mp4 sachin.mp4
110
+ spanish.mp4 sg.mp4
111
+ spanish.mp4 fergi.mp4
112
+ spanish.mp4 spanish_lec1.mp4
113
+ spanish.mp4 bush_small.mp4
114
+ spanish.mp4 macca_cut.mp4
115
+ spanish.mp4 ca_cropped.mp4
116
+ spanish.mp4 guardiola.mp4
117
+ spanish.mp4 lecun.mp4
118
+ spanish.mp4 spanish_lec0.mp4
119
+ bush_small.mp4 emma_cropped.mp4
120
+ bush_small.mp4 elon.mp4
121
+ bush_small.mp4 messi2.mp4
122
+ bush_small.mp4 spanish_lec1.mp4
123
+ bush_small.mp4 bush_small.mp4
124
+ bush_small.mp4 macca_cut.mp4
125
+ bush_small.mp4 ca_cropped.mp4
126
+ bush_small.mp4 lecun.mp4
127
+ bush_small.mp4 spanish_lec0.mp4
128
+ emma_cropped.mp4 emma_cropped.mp4
129
+ emma_cropped.mp4 elon.mp4
130
+ emma_cropped.mp4 spanish_lec1.mp4
131
+ emma_cropped.mp4 spanish_lec0.mp4
132
+ sg.mp4 emma_cropped.mp4
133
+ sg.mp4 mourinho.mp4
134
+ sg.mp4 elon.mp4
135
+ sg.mp4 messi2.mp4
136
+ sg.mp4 cr1.mp4
137
+ sg.mp4 sachin.mp4
138
+ sg.mp4 sg.mp4
139
+ sg.mp4 fergi.mp4
140
+ sg.mp4 spanish_lec1.mp4
141
+ sg.mp4 bush_small.mp4
142
+ sg.mp4 macca_cut.mp4
143
+ sg.mp4 ca_cropped.mp4
144
+ sg.mp4 lecun.mp4
145
+ sg.mp4 spanish_lec0.mp4
146
+ spanish_lec0.mp4 spanish_lec0.mp4
147
+ mourinho.mp4 emma_cropped.mp4
148
+ mourinho.mp4 mourinho.mp4
149
+ mourinho.mp4 elon.mp4
150
+ mourinho.mp4 messi2.mp4
151
+ mourinho.mp4 cr1.mp4
152
+ mourinho.mp4 sachin.mp4
153
+ mourinho.mp4 sg.mp4
154
+ mourinho.mp4 fergi.mp4
155
+ mourinho.mp4 spanish_lec1.mp4
156
+ mourinho.mp4 bush_small.mp4
157
+ mourinho.mp4 macca_cut.mp4
158
+ mourinho.mp4 ca_cropped.mp4
159
+ mourinho.mp4 lecun.mp4
160
+ mourinho.mp4 spanish_lec0.mp4
evaluation/test_filelists/ReSyncED/tts_pairs.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ adam_1.mp4 andreng_optimization.wav
2
+ agad_2.mp4 agad_2.wav
3
+ agad_1.mp4 agad_1.wav
4
+ agad_3.mp4 agad_3.wav
5
+ rms_prop_1.mp4 rms_prop_tts.wav
6
+ tf_1.mp4 tf_1.wav
7
+ tf_2.mp4 tf_2.wav
8
+ andrew_ng_ai_business.mp4 andrewng_business_tts.wav
9
+ covid_autopsy_1.mp4 autopsy_tts.wav
10
+ news_1.mp4 news_tts.wav
11
+ andrew_ng_fund_1.mp4 andrewng_ai_fund.wav
12
+ covid_treatments_1.mp4 covid_tts.wav
13
+ pytorch_v_tf.mp4 pytorch_vs_tf_eng.wav
14
+ pytorch_1.mp4 pytorch.wav
15
+ pkb_1.mp4 pkb_1.wav
16
+ ss_1.mp4 ss_1.wav
17
+ carlsen_1.mp4 carlsen_eng.wav
18
+ french.mp4 french.wav
evaluation/test_filelists/lrs2.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrs3.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrw.txt ADDED
The diff for this file is too large to render. See raw diff
 
examples/driven_audio/RD_Radio31_000.wav ADDED
Binary file (512 kB). View file
 
examples/driven_audio/RD_Radio34_002.wav ADDED
Binary file (512 kB). View file
 
examples/driven_audio/RD_Radio36_000.wav ADDED
Binary file (512 kB). View file
 
examples/driven_audio/RD_Radio40_000.wav ADDED
Binary file (512 kB). View file
 
examples/driven_audio/bus_chinese.wav ADDED
Binary file (652 kB). View file
 
examples/driven_audio/chinese_news.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b0f4d313a1ca671bc4831d60bcf0c12225efbffe6c0e93e54fbfe9bcd4021cb
3
+ size 1536078
examples/driven_audio/chinese_poem1.wav ADDED
Binary file (263 kB). View file
 
examples/driven_audio/chinese_poem2.wav ADDED
Binary file (461 kB). View file
 
examples/driven_audio/deyu.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba1839c57770a2ab0b593ce814344bfd4d750da02acc9be9e8cf5b9113a0f88a
3
+ size 2694784
examples/driven_audio/eluosi.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4a3593815dc7b68c256672baa61934c9479efa770af2065fb0886f02713606e
3
+ size 1786672
examples/driven_audio/fayu.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ebd13626ae4171030b4ea05cceef06078483c352e4b68d469fc2a52bfffceb
3
+ size 1940428
examples/driven_audio/imagine.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2db410217e074d91ae6011e1c5dc0b94f02d05d381c50af8e54253eeacad17d2
3
+ size 1618510
examples/driven_audio/itosinger1.wav ADDED
Binary file (500 kB). View file
 
examples/driven_audio/japanese.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db5426d0b158799e2be4f609b11f75bfbd4affffe18e9a1c8e6f241fcdedcfc
3
+ size 2622712
examples/ref_video/WDA_AlexandriaOcasioCortez_000.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85242c3fc4d50e2202cea393b9e7ee59019759b68e78e26a254d528c22615a7
3
+ size 2257667
examples/ref_video/WDA_KatieHill_000.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fbb4cfd64eedc49b170c441714a9c4fd5e2c2f8a11592070ad89fbd257f2817
3
+ size 3548230
examples/source_image/art_0.png ADDED