Commit · 4085332
Parent(s): 638c0b5
Update app

app.py CHANGED
@@ -522,7 +522,9 @@ def resample_video(video_file, video_fname, result_folder):
         - result_folder (string) : Path of the folder to save the resampled video
     Returns:
         - video_file_25fps (string) : Path of the resampled video file
+        - msg (string) : Message to be returned
     '''
+
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
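The hunk above only shows where `resample_video` builds its output path; the actual conversion to a constant 25 fps happens further down in the function and is not part of this diff. For reference, a minimal sketch of that step using a plain `ffmpeg` subprocess call (the exact command line the app uses is an assumption here):

```python
import os
import subprocess

def resample_to_25fps(video_file, video_fname, result_folder):
    # Output path mirrors the one built in resample_video above
    video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))

    # "-r 25" forces a constant 25 fps output; "-y" overwrites without asking
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file, "-r", "25", video_file_25fps],
        check=True,
    )
    return video_file_25fps
```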
@@ -547,10 +549,6 @@ def load_checkpoint(path, model):
 
     # Load the checkpoint
     checkpoint = torch.load(path, map_location="cpu")
-    # if use_cuda:
-    #     checkpoint = torch.load(path)
-    # else:
-    #     checkpoint = torch.load(path, map_location="cpu")
 
     s = checkpoint["state_dict"]
     new_s = {}
@@ -559,8 +557,6 @@ def load_checkpoint(path, model):
         new_s[k.replace('module.', '')] = v
     model.load_state_dict(new_s)
 
-    # model.to(device)
-
     print("Loaded checkpoint from: {}".format(path))
 
     return model.eval()
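`load_checkpoint` follows the standard recipe for restoring a model trained under `nn.DataParallel`: load on CPU, strip the `module.` prefix that DataParallel adds to every parameter name, then switch to eval mode. A self-contained sketch of the same pattern (the two-layer model is a stand-in, not the app's network):

```python
import torch
import torch.nn as nn

def load_checkpoint(path, model):
    # map_location="cpu" makes the load work on machines without a GPU
    checkpoint = torch.load(path, map_location="cpu")

    # nn.DataParallel saves parameters as "module.<name>"; strip the prefix
    new_s = {k.replace('module.', ''): v
             for k, v in checkpoint["state_dict"].items()}
    model.load_state_dict(new_s)
    return model.eval()

# Round-trip demo with a stand-in model
net = nn.Sequential(nn.Linear(8, 4), nn.ReLU())
state = {'module.' + k: v for k, v in net.state_dict().items()}
torch.save({"state_dict": state}, "ckpt.pth")
net = load_checkpoint("ckpt.pth", nn.Sequential(nn.Linear(8, 4), nn.ReLU()))
```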
@@ -688,6 +684,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     Args:
         - input_frames (list) : List of frames extracted from the video
         - kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - stride (int) : Stride to extract the frames
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - width (int) : Width of the frames
@@ -745,10 +742,8 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     input_frames = np.array(input_frames_masked) / 255.
     if asd:
         input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
-    # print("Input images full: ", input_frames.shape) # num_framesx270x480x3
 
     input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    # print("Input images window: ", input_frames.shape) # Tx25x270x480x3
     print("Successfully created masked input frames")
 
     num_frames = input_frames.shape[0]
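The windowing step in this hunk turns a `(num_frames, H, W, 3)` array into overlapping clips of `window_frames` frames taken every `stride` frames, with 12 edge-replicated frames padded on each side in ASD mode so every original frame can sit at the centre of a window. A standalone sketch with toy dimensions:

```python
import numpy as np

window_frames, stride, asd = 25, 1, True
frames = np.random.rand(40, 270, 480, 3)  # toy stand-in for masked RGB frames

if asd:
    # Replicate the first/last frame 12 times at each end ('edge' padding)
    frames = np.pad(frames, ((12, 12), (0, 0), (0, 0), (0, 0)), 'edge')

# Slide a window of 25 frames with the given stride; drop incomplete tails
windows = np.array([frames[i:i + window_frames]
                    for i in range(0, frames.shape[0], stride)
                    if i + window_frames <= frames.shape[0]])
print(windows.shape)  # (40, 25, 270, 480, 3)
```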
@@ -766,6 +761,7 @@ def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, st
 
     Args:
         - wav_file (string) : Path of the extracted audio file
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - num_frames (int) : Number of frames to extract
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - stride (int) : Stride to extract the audio frames
@@ -919,6 +915,7 @@ def generate_video(frames, audio_file, video_fname):
         - video_fname (string) : Path of the video file
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     fname = 'inference.avi'
@@ -962,6 +959,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp
         - fps (int) : Frames per second of the video
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     if offset == 0:
@@ -1060,6 +1058,7 @@ def extract_audio(video, result_folder):
         - result_folder (string) : Path of the folder to save the extracted audio file
     Returns:
         - wav_file (string) : Path of the extracted audio file
+        - msg (string) : Message to be returned
     '''
 
     wav_file = os.path.join(result_folder, "audio.wav")
@@ -1083,40 +1082,13 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         - video_sequences (array) : Array of video frames to be used as input to the model
         - audio_sequences (array) : Array of audio frames to be used as input to the model
         - model (object) : Model object
+        - asd (bool) : Active speaker detection task flag to return the correct dimensions for the embeddings
         - calc_aud_emb (bool) : Flag to calculate the audio embedding
     Returns:
         - video_emb (array) : Video embedding
         - audio_emb (array) : Audio embedding
     '''
 
-    # video_emb = []
-    # audio_emb = []
-
-    # # model = model.to(device)
-
-    # for i in tqdm(range(0, len(video_sequences), batch_size)):
-    #     video_inp = video_sequences[i:i+batch_size, ]
-    #     vid_emb = model.forward_vid(video_inp, return_feats=False)
-    #     vid_emb = torch.mean(vid_emb, axis=-1)
-
-    #     video_emb.append(vid_emb.detach().cpu())
-
-    #     if calc_aud_emb:
-    #         audio_inp = audio_sequences[i:i+batch_size, ]
-    #         aud_emb = model.forward_aud(audio_inp)
-    #         audio_emb.append(aud_emb.detach().cpu())
-
-    #     # torch.cuda.empty_cache()
-
-    # print("Extracted embeddings: ", len(video_emb), len(audio_emb))
-
-
-    # if calc_aud_emb==True:
-    #     print("returning audio and video embeddings...")
-    #     return video_emb, audio_emb
-
-    # return video_emb
-
     video_emb = []
     audio_emb = []
 
@@ -1127,23 +1099,19 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         if not asd:
             vid_emb = vid_emb.unsqueeze(-1)
 
-        # video_emb.append(vid_emb.detach())
         video_emb.extend(vid_emb.detach().cpu().numpy())
 
         if calc_aud_emb:
             audio_inp = audio_sequences[i:i+batch_size, ]
             aud_emb = model.forward_aud(audio_inp)
-            # audio_emb.append(aud_emb.detach())
             audio_emb.extend(aud_emb.detach().cpu().numpy())
 
         torch.cuda.empty_cache()
 
-    # video_emb = torch.cat(video_emb, dim=0)
     video_emb = np.array(video_emb)
     print("Video Embedding Shape: ", video_emb.shape)
 
     if calc_aud_emb:
-        # audio_emb = torch.cat(audio_emb, dim=0)
         audio_emb = np.array(audio_emb)
         print("Audio Embedding Shape: ", audio_emb.shape)
 
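The loop in `get_embeddings` batches sequences through the encoders and immediately moves each batch's output to CPU NumPy, so GPU memory stays bounded by `batch_size` regardless of video length. A minimal sketch of the same pattern (the linear encoder and sizes are stand-ins for the model's `forward_vid`/`forward_aud`):

```python
import numpy as np
import torch
import torch.nn as nn

encoder = nn.Linear(16, 8)        # stand-in for the real encoder
sequences = torch.randn(100, 16)  # stand-in input sequences
batch_size = 12

embeddings = []
for i in range(0, len(sequences), batch_size):
    batch = sequences[i:i + batch_size]
    with torch.no_grad():
        emb = encoder(batch)
    # Move each batch off the GPU right away so memory use stays flat
    embeddings.extend(emb.detach().cpu().numpy())

embeddings = np.array(embeddings)
print("Embedding shape:", embeddings.shape)  # (100, 8)
```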
@@ -1162,8 +1130,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         - all_video_embeddings (array) : Array of video embeddings of all speakers
         - audio_embedding (array) : Audio embedding
         - global_score (bool) : Flag to calculate the global score
+        - num_avg_frames (int) : Number of frames to average the scores
+        - model (object) : Model object
     Returns:
         - pred_speaker (list) : List of active speakers in each frame
+        - num_avg_frames (int) : Number of frames to average the scores
     '''
 
     cos = nn.CosineSimilarity(dim=1)
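Speaker scoring here rests on `nn.CosineSimilarity(dim=1)`: the audio embedding is compared against each candidate speaker's video embedding, and the best-matching speaker wins. A toy sketch of that comparison (the embedding size and three-speaker setup are made up for illustration):

```python
import torch
import torch.nn as nn

cos = nn.CosineSimilarity(dim=1)

audio_emb = torch.randn(1, 512)                        # one audio embedding
video_embs = [torch.randn(1, 512) for _ in range(3)]   # one per speaker

# Score every candidate speaker against the audio and pick the best match
scores = [cos(audio_emb, v).item() for v in video_embs]
pred_speaker = int(torch.tensor(scores).argmax())
print("Scores:", scores, "-> active speaker:", pred_speaker)
```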
@@ -1217,6 +1188,7 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
         - result_folder (string) : Path of the result folder to save the output video
     Returns:
         - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
     '''
 
     try:
@@ -1245,6 +1217,16 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
 
 def preprocess_asd(video_path, result_folder_input):
 
+    '''
+    This function preprocesses the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - result_folder_input (string) : Path of the folder to save the input video
+    Returns:
+        - msg (string) : Message to be returned
+    '''
+
     file = video_path
 
     data_dir = os.path.join(result_folder_input, 'temp')
@@ -1270,6 +1252,18 @@ def preprocess_asd(video_path, result_folder_input):
 
 def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
+    '''
+    This function processes the video for the sync offset prediction task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - num_avg_frames (int) : Number of frames to average the scores
+        - apply_preprocess (bool) : Flag to apply the pre-processing steps or not
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1322,7 +1316,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
             msg = "Error: The input video is too short. Please use a longer input video."
             return None, msg
 
-        # if apply_preprocess:
         # Load keypoints and check if gestures are visible
         kp_dict, status = get_keypoints(frames)
         if status != "success":
@@ -1332,8 +1325,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         status = check_visible_gestures(kp_dict)
         if status != "success":
             return None, status
-        # else:
-        #     kp_dict = None
 
         # Load RGB frames
         rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
@@ -1366,9 +1357,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Extract embeddings
         print("Obtaining audio and video embeddings...")
         video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
-        print("Obtained embeddings")
-        # video_emb = torch.cat(video_emb, dim=0)
-        # audio_emb = torch.cat(audio_emb, dim=0)
 
         # L2 normalize embeddings
         print("Normalizing embeddings")
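The "L2 normalize embeddings" step that follows is only named in a comment in this hunk; scaling each embedding to unit L2 norm makes cosine similarity reduce to a plain dot product. One common way to implement it is `torch.nn.functional.normalize`, sketched below (whether the app uses this exact call is an assumption):

```python
import torch
import torch.nn.functional as F

emb = torch.randn(40, 512)          # toy batch of embeddings
emb = F.normalize(emb, p=2, dim=1)  # each row now has unit L2 norm
print(emb.norm(dim=1)[:3])          # tensor([1., 1., 1.])
```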
@@ -1408,6 +1396,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
 
 def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
+
+    '''
+    This function processes the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - global_speaker (string) : Flag to use global or per-frame predictions
+        - num_avg_frames (int) : Number of frames to average the scores
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1689,6 +1690,13 @@ if __name__ == "__main__":
 				<li>Input videos with clearly visible gestures work better.</li>
 			</ul>
 
+			Inference time:
+			<ul>
+				<li>Synchronization-correction: ~1 minute for a 10-second video</li>
+				<li>Active-speaker-detection: ~2 minutes for a 10-second video</li>
+			</ul>
+			Note: Occasionally, there may be a delay in acquiring a GPU, as the model runs on a free community GPU from ZeroGPU.
+
 		</div>
 	"""
 