sindhuhegde committed
Commit 4085332 · 1 Parent(s): 638c0b5

Update app

Files changed (1):
  1. app.py +54 -46
app.py CHANGED
@@ -522,7 +522,9 @@ def resample_video(video_file, video_fname, result_folder):
         - result_folder (string) : Path of the folder to save the resampled video
     Returns:
         - video_file_25fps (string) : Path of the resampled video file
+        - msg (string) : Message to be returned
     '''
+
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
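The body of `resample_video` is not part of this hunk; as a rough sketch of what a 25 fps resample step typically looks like (assuming `ffmpeg` is available on the path, with a hypothetical helper name and error message):

```python
import subprocess

def resample_to_25fps(video_file, video_file_25fps):
    # Hypothetical helper: re-encode the input video at 25 fps with ffmpeg.
    cmd = ["ffmpeg", "-y", "-loglevel", "error", "-i", video_file, "-r", "25", video_file_25fps]
    if subprocess.call(cmd) != 0:
        return None, "Error: Could not resample the video to 25 fps"
    return video_file_25fps, "success"
```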
@@ -547,10 +549,6 @@ def load_checkpoint(path, model):
 
     # Load the checkpoint
     checkpoint = torch.load(path, map_location="cpu")
-    # if use_cuda:
-    #     checkpoint = torch.load(path)
-    # else:
-    #     checkpoint = torch.load(path, map_location="cpu")
 
     s = checkpoint["state_dict"]
     new_s = {}
@@ -559,8 +557,6 @@ def load_checkpoint(path, model):
         new_s[k.replace('module.', '')] = v
     model.load_state_dict(new_s)
 
-    # model.to(device)
-
     print("Loaded checkpoint from: {}".format(path))
 
     return model.eval()
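Pieced together from the context lines of the two `load_checkpoint` hunks, the loader after this commit reduces to roughly the following (the dict comprehension is a compact stand-in for the key-renaming loop shown above):

```python
import torch

def load_checkpoint(path, model):
    # Load onto CPU unconditionally; the commented-out CUDA branch was dropped in this commit.
    checkpoint = torch.load(path, map_location="cpu")

    # Strip the 'module.' prefix added by DataParallel so the keys match the bare model.
    new_s = {k.replace('module.', ''): v for k, v in checkpoint["state_dict"].items()}
    model.load_state_dict(new_s)

    print("Loaded checkpoint from: {}".format(path))
    return model.eval()
```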
@@ -688,6 +684,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     Args:
         - input_frames (list) : List of frames extracted from the video
         - kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - stride (int) : Stride to extract the frames
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - width (int) : Width of the frames
@@ -745,10 +742,8 @@ def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_fr
     input_frames = np.array(input_frames_masked) / 255.
     if asd:
         input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
-    # print("Input images full: ", input_frames.shape)      # num_framesx270x480x3
 
     input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    # print("Input images window: ", input_frames.shape)    # Tx25x270x480x3
     print("Successfully created masked input frames")
 
     num_frames = input_frames.shape[0]
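To make the padding and windowing in the hunk above concrete, the same logic on a toy array (tiny 27x48 frames instead of 270x480, purely for illustration):

```python
import numpy as np

frames = np.zeros((100, 27, 48, 3), dtype=np.float32)  # toy stand-in for 100 masked frames
window_frames, stride, asd = 25, 1, True

if asd:
    # Edge-pad 12 frames on each side so every original frame can sit at a window centre.
    frames = np.pad(frames, ((12, 12), (0, 0), (0, 0), (0, 0)), 'edge')

# Slide a window of `window_frames` frames over the sequence, `stride` frames at a time.
windows = np.array([frames[i:i + window_frames]
                    for i in range(0, frames.shape[0], stride)
                    if i + window_frames <= frames.shape[0]])
print(windows.shape)  # (100, 25, 27, 48, 3)
```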
@@ -766,6 +761,7 @@ def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, st
 
     Args:
         - wav_file (string) : Path of the extracted audio file
+        - asd (bool) : Whether to use padding (needed for active speaker detection task) or not
         - num_frames (int) : Number of frames to extract
         - window_frames (int) : Number of frames in each window that is given as input to the model
         - stride (int) : Stride to extract the audio frames
@@ -919,6 +915,7 @@ def generate_video(frames, audio_file, video_fname):
         - video_fname (string) : Path of the video file
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     fname = 'inference.avi'
@@ -962,6 +959,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp
         - fps (int) : Frames per second of the video
     Returns:
         - video_output (string) : Path of the video file
+        - msg (string) : Message to be returned
     '''
 
     if offset == 0:
@@ -1060,6 +1058,7 @@ def extract_audio(video, result_folder):
         - result_folder (string) : Path of the folder to save the extracted audio file
     Returns:
         - wav_file (string) : Path of the extracted audio file
+        - msg (string) : Message to be returned
     '''
 
     wav_file = os.path.join(result_folder, "audio.wav")
@@ -1083,40 +1082,13 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         - video_sequences (array) : Array of video frames to be used as input to the model
         - audio_sequences (array) : Array of audio frames to be used as input to the model
         - model (object) : Model object
+        - asd (bool) : Active speaker detection task flag to return the correct dimensions for the embeddings
         - calc_aud_emb (bool) : Flag to calculate the audio embedding
     Returns:
         - video_emb (array) : Video embedding
         - audio_emb (array) : Audio embedding
     '''
 
-    # video_emb = []
-    # audio_emb = []
-
-    # # model = model.to(device)
-
-    # for i in tqdm(range(0, len(video_sequences), batch_size)):
-    #     video_inp = video_sequences[i:i+batch_size, ]
-    #     vid_emb = model.forward_vid(video_inp, return_feats=False)
-    #     vid_emb = torch.mean(vid_emb, axis=-1)
-
-    #     video_emb.append(vid_emb.detach().cpu())
-
-    #     if calc_aud_emb:
-    #         audio_inp = audio_sequences[i:i+batch_size, ]
-    #         aud_emb = model.forward_aud(audio_inp)
-    #         audio_emb.append(aud_emb.detach().cpu())
-
-    #     # torch.cuda.empty_cache()
-
-    # print("Extracted embeddings: ", len(video_emb), len(audio_emb))
-
-
-    # if calc_aud_emb==True:
-    #     print("returning audio and video embeddings...")
-    #     return video_emb, audio_emb
-
-    # return video_emb
-
     video_emb = []
     audio_emb = []
 
@@ -1127,23 +1099,19 @@ def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_
         if not asd:
             vid_emb = vid_emb.unsqueeze(-1)
 
-        # video_emb.append(vid_emb.detach())
         video_emb.extend(vid_emb.detach().cpu().numpy())
 
         if calc_aud_emb:
             audio_inp = audio_sequences[i:i+batch_size, ]
             aud_emb = model.forward_aud(audio_inp)
-            # audio_emb.append(aud_emb.detach())
             audio_emb.extend(aud_emb.detach().cpu().numpy())
 
         torch.cuda.empty_cache()
 
-    # video_emb = torch.cat(video_emb, dim=0)
     video_emb = np.array(video_emb)
     print("Video Embedding Shape: ", video_emb.shape)
 
     if calc_aud_emb:
-        # audio_emb = torch.cat(audio_emb, dim=0)
        audio_emb = np.array(audio_emb)
        print("Audio Embedding Shape: ", audio_emb.shape)
 
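Reading the two `get_embeddings` hunks together, the cleaned-up loop now accumulates NumPy arrays batch by batch instead of concatenating torch tensors at the end. A self-contained approximation is below; `forward_vid`, `forward_aud` and the loop structure come from the hunks, while the `batch_size` default and the return convention are assumptions not shown in this diff:

```python
import numpy as np
import torch

def get_embeddings(video_sequences, audio_sequences, model, asd=False,
                   calc_aud_emb=True, batch_size=12):
    video_emb, audio_emb = [], []

    for i in range(0, len(video_sequences), batch_size):
        video_inp = video_sequences[i:i + batch_size]
        vid_emb = model.forward_vid(video_inp, return_feats=False)
        vid_emb = torch.mean(vid_emb, axis=-1)
        if not asd:
            vid_emb = vid_emb.unsqueeze(-1)

        # Move each batch off the GPU immediately so memory is released between batches.
        video_emb.extend(vid_emb.detach().cpu().numpy())

        if calc_aud_emb:
            audio_inp = audio_sequences[i:i + batch_size]
            aud_emb = model.forward_aud(audio_inp)
            audio_emb.extend(aud_emb.detach().cpu().numpy())

        torch.cuda.empty_cache()

    video_emb = np.array(video_emb)
    if calc_aud_emb:
        return video_emb, np.array(audio_emb)
    return video_emb
```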
@@ -1162,8 +1130,11 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         - all_video_embeddings (array) : Array of video embeddings of all speakers
         - audio_embedding (array) : Audio embedding
         - global_score (bool) : Flag to calculate the global score
+        - num_avg_frames (int) : Number of frames to average the scores
+        - model (object) : Model object
     Returns:
         - pred_speaker (list) : List of active speakers in each frame
+        - num_avg_frames (int) : Number of frames to average the scores
     '''
 
     cos = nn.CosineSimilarity(dim=1)
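Only the docstring and the `cos = nn.CosineSimilarity(dim=1)` line of `predict_active_speaker` appear in this diff, so the snippet below is just the usual per-frame recipe such a function implies, not the app's exact logic: score every speaker's video embedding against the audio embedding and keep the best match (hypothetical helper name):

```python
import torch
import torch.nn as nn

def pick_speaker_per_frame(all_video_embeddings, audio_embedding):
    # all_video_embeddings: list of (num_frames, dim) tensors, one entry per visible speaker
    # audio_embedding:      (num_frames, dim) tensor for the shared audio track
    cos = nn.CosineSimilarity(dim=1)
    # Cosine similarity between each speaker's video embedding and the audio, frame by frame.
    scores = torch.stack([cos(v, audio_embedding) for v in all_video_embeddings])  # (num_speakers, num_frames)
    # The predicted active speaker in a frame is the speaker with the highest similarity.
    return torch.argmax(scores, dim=0)  # (num_frames,)
```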
@@ -1217,6 +1188,7 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
         - result_folder (string) : Path of the result folder to save the output video
     Returns:
         - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
     '''
 
     try:
@@ -1245,6 +1217,16 @@ def save_video(output_tracks, input_frames, wav_file, result_folder):
 
 def preprocess_asd(video_path, result_folder_input):
 
+    '''
+    This function preprocesses the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - result_folder_input (string) : Path of the folder to save the input video
+    Returns:
+        - msg (string) : Message to be returned
+    '''
+
     file = video_path
 
     data_dir = os.path.join(result_folder_input, 'temp')
@@ -1270,6 +1252,18 @@ def preprocess_asd(video_path, result_folder_input):
 
 def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
+    '''
+    This function processes the video for the sync offset prediction task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - num_avg_frames (int) : Number of frames to average the scores
+        - apply_preprocess (bool) : Flag to apply the pre-processing steps or not
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1322,7 +1316,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
             msg = "Error: The input video is too short. Please use a longer input video."
             return None, msg
 
-        # if apply_preprocess:
         # Load keypoints and check if gestures are visible
         kp_dict, status = get_keypoints(frames)
         if status != "success":
@@ -1332,8 +1325,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         status = check_visible_gestures(kp_dict)
         if status != "success":
             return None, status
-        # else:
-        #     kp_dict = None
 
         # Load RGB frames
         rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
@@ -1366,9 +1357,6 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Extract embeddings
         print("Obtaining audio and video embeddings...")
         video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)
-        print("Obtained embeddings")
-        # video_emb = torch.cat(video_emb, dim=0)
-        # audio_emb = torch.cat(audio_emb, dim=0)
 
         # L2 normalize embeddings
         print("Normalizing embeddings")
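The L2 normalization referred to in the context lines is not shown in this hunk; it is typically the standard unit-norm scaling over the feature axis, for example:

```python
import numpy as np

def l2_normalize(emb, eps=1e-8):
    # Scale each embedding vector to unit length along the feature dimension.
    return emb / (np.linalg.norm(emb, axis=1, keepdims=True) + eps)

video_emb = l2_normalize(np.random.rand(10, 512))
print(np.linalg.norm(video_emb, axis=1))  # all ~1.0
```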
@@ -1408,6 +1396,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
 
 
 def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
+
+    '''
+    This function processes the video for the active speaker detection task
+
+    Args:
+        - video_path (string) : Path of the video file
+        - global_speaker (string) : Flag to use global or per-frame predictions
+        - num_avg_frames (int) : Number of frames to average the scores
+    Returns:
+        - video_output (string) : Path of the output video
+        - msg (string) : Message to be returned
+    '''
+
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -1689,6 +1690,13 @@ if __name__ == "__main__":
                 <li>Input videos with clearly visible gestures work better.</li>
             </ul>
 
+            Inference time:
+            <ul>
+                <li>Synchronization-correction: ~1 minute for a 10-second video</li>
+                <li>Active-speaker-detection: ~2 minutes for a 10-second video</li>
+            </ul>
+            Note: Occasionally, there may be a delay in acquiring a GPU, as the model runs on a free community GPU from ZeroGPU.
+
         </div>
         """
 