Ahsen Khaliq committed
Commit 2410583 · 1 Parent(s): 3104d49

Create app.py

Files changed (1)
app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ import torch
+ from transformers import Wav2Vec2Processor, HubertForCTC
+ import soundfile as sf
+ import gradio as gr
+ from moviepy.editor import AudioFileClip, CompositeAudioClip, ImageClip, ImageSequenceClip, VideoFileClip
+ import cv2
+
+ def get_optimal_font_scale(text, width):
+     # Search downward from a large font scale and return the first (largest)
+     # scale at which the rendered text still fits within `width` pixels.
+     for scale in reversed(range(0, 60, 1)):
+         text_size = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=scale / 10, thickness=1)
+         new_width = text_size[0][0]
+         if new_width <= width:
+             return scale / 10
+     return 1
+
+ processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+ model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
+
+ def map_to_array(file):
+     # Read the waveform from disk; the model was fine-tuned on 16 kHz mono audio.
+     speech, _ = sf.read(file)
+     return speech
+
+ def inference(audio, image):
+     # Transcribe the uploaded audio with HuBERT (batch size 1).
+     input_values = processor(map_to_array(audio.name), return_tensors="pt").input_values
+     with torch.no_grad():
+         logits = model(input_values).logits
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.decode(predicted_ids[0])
+
+     # Render the still image as a video clip, choosing the frame rate so that
+     # the clip contains exactly one frame per transcribed word.
+     audio_clip = AudioFileClip(audio.name)
+     image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
+     image_clip.write_videofile("my_video.mp4", fps=len(transcription.split()) / audio_clip.duration)
+     videoclip = VideoFileClip("my_video.mp4")
+
+     # Mux the original audio onto the still-image video.
+     new_audioclip = CompositeAudioClip([audio_clip])
+     videoclip.audio = new_audioclip
+     videoclip.write_videofile("new_filename.mp4")
+
+     # Map frame number -> word, so frame k shows the k-th word of the transcript.
+     frames = {k + 1: v.strip() for k, v in enumerate(transcription.split())}
+
+     # Re-open the muxed video and draw each word onto its frame.
+     cap = cv2.VideoCapture("new_filename.mp4")
+     w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
+     h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
+     font = cv2.FONT_HERSHEY_SIMPLEX
+
+     frame_list = []
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+         frame_no = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
+         if frame_no in frames:
+             font_scale = get_optimal_font_scale(frames[frame_no], w - 20)
+             cv2.putText(frame, frames[frame_no], (10, int(h) // 2), font,
+                         font_scale, (0, 0, 0), 2, cv2.LINE_AA)
+         # OpenCV decodes frames as BGR; moviepy expects RGB arrays.
+         frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+     # Rebuild the video from the captioned frames and reattach the audio.
+     output_clip = ImageSequenceClip(frame_list, fps=len(transcription.split()) / audio_clip.duration)
+     output_clip.audio = new_audioclip
+     output_clip.write_videofile("output6.mp4")
+     cap.release()
+     cv2.destroyAllWindows()
+     return transcription, "output6.mp4"
+
+ title = "Hubert"
+ description = "Gradio demo for HuBERT speech recognition. To use it, upload an audio clip and an image; the app returns the transcription and a word-by-word captioned video. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.07447'>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</a> | <a href='https://github.com/pytorch/fairseq/tree/main/examples/hubert'>GitHub Repo</a></p>"
+
+ gr.Interface(
+     inference,
+     [gr.inputs.Audio(type="file"), gr.inputs.Image(type="file", label="Input")],
+     [gr.outputs.Textbox(label="Output"), gr.outputs.Video(label="Video Out")],
+     title=title,
+     description=description,
+     article=article,
+     enable_queue=True,
+ ).launch(debug=True)