sindhuhegde committed
Commit 4085332 · 1 Parent(s): 638c0b5

Update app

Files changed (1):
  1. app.py +54 -46
app.py CHANGED
@@ -522,7 +522,9 @@ def resample_video(video_file, video_fname, result_folder):
         - result_folder (string) : Path of the folder to save the resampled video
     Returns:
         - video_file_25fps (string) : Path of the resampled video file
+        - msg (string) : Message to be returned
     '''
+
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
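The body of `resample_video` is not part of this hunk; as a rough sketch of what a 25 fps resample step typically looks like (assuming `ffmpeg` is available on the path, with a hypothetical helper name and error message):

```python
import subprocess

def resample_to_25fps(video_file, video_file_25fps):
    # Hypothetical helper: re-encode the input video at 25 fps with ffmpeg.
    cmd = ["ffmpeg", "-y", "-loglevel", "error", "-i", video_file, "-r", "25", video_file_25fps]
    if subprocess.call(cmd) != 0:
        return None, "Error: Could not resample the video to 25 fps"
    return video_file_25fps, "success"
```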
@@ -547,10 +549,6 @@ def load_checkpoint(path, model):
 
     # Load the checkpoint
     checkpoint = torch.load(path, map_location="cpu")
-    # if use_cuda:
-    #     checkpoint = torch.load(path)
-    # else:
-    #     checkpoint = torch.load(path, map_location="cpu")
 
     s = checkpoint["state_dict"]
     new_s = {}
@@ -559,8 +557,6 @@ def load_checkpoint(path, model):
         new_s[k.replace('module.', '')] = v
     model.load_state_dict(new_s)
 
-    # model.to(device)
-
     print("Loaded checkpoint from: {}".format(path))
 
     return model.eval()
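Pieced together from the context lines of the two `load_checkpoint` hunks, the loader after this commit reduces to roughly the following (the dict comprehension is a compact stand-in for the key-renaming loop shown above):

```python
import torch

def load_checkpoint(path, model):
    # Load onto CPU unconditionally; the commented-out CUDA branch was dropped in this commit.
    checkpoint = torch.load(path, map_location="cpu")

    # Strip the 'module.' prefix added by DataParallel so the keys match the bare model.
    new_s = {k.replace('module.', ''): v for k, v in checkpoint["state_dict"].items()}
    model.load_state_dict(new_s)

    print("Loaded checkpoint from: {}".format(path))
    return model.eval()
```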
@@ -688,6 +684,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     Args:
         - input_frames (list) : List of frames extracted from the video
         - kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - stride (int) : Stride to extract the frames
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - width (int) : Width of the frames
@@ -745,10 +742,8 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     input_frames = np.array(input_frames_masked) / 255.
     if asd:
         input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
-    # print("Input images full: ", input_frames.shape)      # num_framesx270x480x3
 
     input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    # print("Input images window: ", input_frames.shape)    # Tx25x270x480x3
     print("Successfully created masked input frames")
 
     num_frames = input_frames.shape[0]
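To make the padding and windowing in the hunk above concrete, the same logic on a toy array (tiny 27x48 frames instead of 270x480, purely for illustration):

```python
import numpy as np

frames = np.zeros((100, 27, 48, 3), dtype=np.float32)  # toy stand-in for 100 masked frames
window_frames, stride, asd = 25, 1, True

if asd:
    # Edge-pad 12 frames on each side so every original frame can sit at a window centre.
    frames = np.pad(frames, ((12, 12), (0, 0), (0, 0), (0, 0)), 'edge')

# Slide a window of `window_frames` frames over the sequence, `stride` frames at a time.
windows = np.array([frames[i:i + window_frames]
                    for i in range(0, frames.shape[0], stride)
                    if i + window_frames <= frames.shape[0]])
print(windows.shape)  # (100, 25, 27, 48, 3)
```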
@@ -766,6 +761,7 @@ def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, st
 
     Args:
         - wav_file (string) : Path of the extracted audio file
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - num_frames (int) : Number of frames to extract
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - stride (int) : Stride to extract the audio frames
@@ -919,6 +915,7 @@ def generate_video(frames, audio_file, video_fname):
         - video_fname (string) : Path of the video file
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     fname = 'inference.avi'
@@ -962,6 +959,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp
         - fps (int) : Frames per second of the video
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     if offset == 0:
@@ -1060,6 +1058,7 @@ def extract_audio(video, result_folder):
         - result_folder (string) : Path of the folder to save the extracted audio file
     Returns:
         - wav_file (string) : Path of the extracted audio file
+        - msg (string) : Message to be returned
     '''
 
     wav_file = os.path.join(result_folder, "audio.wav")
@@ -1083,40 +1082,13 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         - video_sequences (array) : Array of video frames to be used as input to the model
         - audio_sequences (array) : Array of audio frames to be used as input to the model
         - model (object) : Model object
+        - asd (bool) : Active speaker detection task flag to return the correct dimensions for the embeddings
         - calc_aud_emb (bool) : Flag to calculate the audio embedding
     Returns:
         - video_emb (array) : Video embedding
         - audio_emb (array) : Audio embedding
     '''
 
-    # video_emb = []
-    # audio_emb = []
-
-    # # model = model.to(device)
-
-    # for i in tqdm(range(0, len(video_sequences), batch_size)):
-    #     video_inp = video_sequences[i:i+batch_size, ]
-    #     vid_emb = model.forward_vid(video_inp, return_feats=False)
-    #     vid_emb = torch.mean(vid_emb, axis=-1)
-
-    #     video_emb.append(vid_emb.detach().cpu())
-
-    #     if calc_aud_emb:
-    #         audio_inp = audio_sequences[i:i+batch_size, ]
-    #         aud_emb = model.forward_aud(audio_inp)
-    #         audio_emb.append(aud_emb.detach().cpu())
-
-    #     # torch.cuda.empty_cache()
-
-    # print("Extracted embeddings: ", len(video_emb), len(audio_emb))
-
-
-    # if calc_aud_emb==True:
-    #     print("returning audio and video embeddings...")
-    #     return video_emb, audio_emb
-
-    # return video_emb
-
     video_emb = []
     audio_emb = []
 
@@ -1127,23 +1099,19 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         if not asd:
             vid_emb = vid_emb.unsqueeze(-1)
 
-        # video_emb.append(vid_emb.detach())
         video_emb.extend(vid_emb.detach().cpu().numpy())
 
         if calc_aud_emb:
             audio_inp = audio_sequences[i:i+batch_size, ]
             aud_emb = model.forward_aud(audio_inp)
-            # audio_emb.append(aud_emb.detach())
             audio_emb.extend(aud_emb.detach().cpu().numpy())
 
         torch.cuda.empty_cache()
 
-    # video_emb = torch.cat(video_emb, dim=0)
     video_emb = np.array(video_emb)
     print("Video Embedding Shape: ", video_emb.shape)
 
     if calc_aud_emb:
-        # audio_emb = torch.cat(audio_emb, dim=0)
        audio_emb = np.array(audio_emb)
        print("Audio Embedding Shape: ", audio_emb.shape)
 
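Reading the two `get_embeddings` hunks together, the cleaned-up loop now accumulates NumPy arrays batch by batch instead of concatenating torch tensors at the end. A self-contained approximation is below; `forward_vid`, `forward_aud` and the loop structure come from the hunks, while the `batch_size` default and the return convention are assumptions not shown in this diff:

```python
import numpy as np
import torch

def get_embeddings(video_sequences, audio_sequences, model, asd=False,
                   calc_aud_emb=True, batch_size=12):
    video_emb, audio_emb = [], []

    for i in range(0, len(video_sequences), batch_size):
        video_inp = video_sequences[i:i + batch_size]
        vid_emb = model.forward_vid(video_inp, return_feats=False)
        vid_emb = torch.mean(vid_emb, axis=-1)
        if not asd:
            vid_emb = vid_emb.unsqueeze(-1)

        # Move each batch off the GPU immediately so memory is released between batches.
        video_emb.extend(vid_emb.detach().cpu().numpy())

        if calc_aud_emb:
            audio_inp = audio_sequences[i:i + batch_size]
            aud_emb = model.forward_aud(audio_inp)
            audio_emb.extend(aud_emb.detach().cpu().numpy())

        torch.cuda.empty_cache()

    video_emb = np.array(video_emb)
    if calc_aud_emb:
        return video_emb, np.array(audio_emb)
    return video_emb
```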
@@ -1162,8 +1130,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         - all_video_embeddings (array) : Array of video embeddings of all speakers
         - audio_embedding (array) : Audio embedding
         - global_score (bool) : Flag to calculate the global score
+        - num_avg_frames (int) : Number of frames to average the scores
+        - model (object) : Model object
     Returns:
         - pred_speaker (list) : List of active speakers in each frame
+        - num_avg_frames (int) : Number of frames to average the scores
     '''
 
     cos = nn.CosineSimilarity(dim=1)
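Only the docstring and the `cos = nn.CosineSimilarity(dim=1)` line of `predict_active_speaker` appear in this diff, so the snippet below is just the usual per-frame recipe such a function implies, not the app's exact logic: score every speaker's video embedding against the audio embedding and keep the best match (hypothetical helper name):

```python
import torch
import torch.nn as nn

def pick_speaker_per_frame(all_video_embeddings, audio_embedding):
    # all_video_embeddings: list of (num_frames, dim) tensors, one entry per visible speaker
    # audio_embedding:      (num_frames, dim) tensor for the shared audio track
    cos = nn.CosineSimilarity(dim=1)
    # Cosine similarity between each speaker's video embedding and the audio, frame by frame.
    scores = torch.stack([cos(v, audio_embedding) for v in all_video_embeddings])  # (num_speakers, num_frames)
    # The predicted active speaker in a frame is the speaker with the highest similarity.
    return torch.argmax(scores, dim=0)  # (num_frames,)
```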
@@ -1217,6 +1188,7 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
         - result_folder (string) : Path of the result folder to save the output video
     Returns:
         - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
     '''
 
     try:
@@ -1245,6 +1217,16 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
 
 def preprocess_asd(video_path, result_folder_input):
 
+    '''
+    This function preprocesses the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - result_folder_input (string) : Path of the folder to save the input video
+    Returns:
+        - msg (string) : Message to be returned
+    '''
+
     file = video_path
 
     data_dir = os.path.join(result_folder_input, 'temp')
@@ -1270,6 +1252,18 @@ def preprocess_asd(video_path, result_folder_input):
 
 def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
+    '''
+    This function processes the video for the sync offset prediction task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - num_avg_frames (int) : Number of frames to average the scores
+        - apply_preprocess (bool) : Flag to apply the pre-processing steps or not
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1322,7 +1316,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
             msg = "Error: The input video is too short. Please use a longer input video."
             return None, msg
 
-        # if apply_preprocess:
         # Load keypoints and check if gestures are visible
         kp_dict, status = get_keypoints(frames)
         if status != "success":
@@ -1332,8 +1325,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         status = check_visible_gestures(kp_dict)
         if status != "success":
             return None, status
-        # else:
-        #     kp_dict = None
 
         # Load RGB frames
         rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
@@ -1366,9 +1357,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Extract embeddings
         print("Obtaining audio and video embeddings...")
         video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
-        print("Obtained embeddings")
-        # video_emb = torch.cat(video_emb, dim=0)
-        # audio_emb = torch.cat(audio_emb, dim=0)
 
         # L2 normalize embeddings
         print("Normalizing embeddings")
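The L2 normalization referred to in the context lines is not shown in this hunk; it is typically the standard unit-norm scaling over the feature axis, for example:

```python
import numpy as np

def l2_normalize(emb, eps=1e-8):
    # Scale each embedding vector to unit length along the feature dimension.
    return emb / (np.linalg.norm(emb, axis=1, keepdims=True) + eps)

video_emb = l2_normalize(np.random.rand(10, 512))
print(np.linalg.norm(video_emb, axis=1))  # all ~1.0
```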
@@ -1408,6 +1396,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
 
 def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
+
+    '''
+    This function processes the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - global_speaker (string) : Flag to use global or per-frame predictions
+        - num_avg_frames (int) : Number of frames to average the scores
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1689,6 +1690,13 @@ if __name__ == "__main__":
                 <li>Input videos with clearly visible gestures work better.</li>
             </ul>
 
+            Inference time:
+            <ul>
+                <li>Synchronization-correction: ~1 minute for a 10-second video</li>
+                <li>Active-speaker-detection: ~2 minutes for a 10-second video</li>
+            </ul>
+            Note: Occasionally, there may be a delay in acquiring a GPU, as the model runs on a free community GPU from ZeroGPU.
+
         </div>
         """
 