# Ahsen Khaliq
# Update app.py
# 7033345
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
import gradio as gr
from moviepy.editor import *
import cv2
import librosa
def get_optimal_font_scale(text, width, *, font_face=None, thickness=1):
    """Return the largest font scale at which ``text`` fits within ``width``.

    Candidate scales are tried from 5.9 down to 0.1 in steps of 0.1. The first
    one whose width, as reported by ``cv2.getTextSize``, does not exceed
    ``width`` is returned.

    Args:
        text: The string that is going to be drawn.
        width: Maximum allowed text width, in pixels.
        font_face: OpenCV font identifier used for the measurement. ``None``
            (the default) selects ``cv2.FONT_HERSHEY_DUPLEX``.
        thickness: Stroke thickness used for the measurement.

    Returns:
        The chosen scale as a float. If the text does not fit even at the
        smallest candidate, 0.1 is returned so the caller still gets a
        positive, drawable scale.
    """
    if font_face is None:
        font_face = cv2.FONT_HERSHEY_DUPLEX

    # Tenths of a scale unit, largest first. Zero is deliberately excluded:
    # a scale of 0 cannot render readable text, so it must never be returned.
    for tenths in range(59, 0, -1):
        scale = tenths / 10
        (text_width, _text_height), _baseline = cv2.getTextSize(
            text, fontFace=font_face, fontScale=scale, thickness=thickness
        )
        if text_width <= width:
            return scale
    return 0.1
# Both the input processor and the CTC model are loaded from the same
# pretrained checkpoint, once, at import time.
_CHECKPOINT = "facebook/hubert-xlarge-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = HubertForCTC.from_pretrained(_CHECKPOINT)
def _transcribe(audio_path):
    """Return the text decoded by the module-level ``model`` for ``audio_path``."""
    # The audio is resampled to 16 kHz while loading; the sample rate that
    # librosa returns alongside the samples is not needed afterwards.
    samples, _ = librosa.load(audio_path, sr=16000)
    input_values = processor(samples, return_tensors="pt").input_values  # batch size 1
    # Pure inference: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


def _caption_frames(video_path, words):
    """Decode ``video_path`` and draw the n-th word of ``words`` on the n-th frame.

    Returns the frames as a list of RGB arrays, ready to hand to MoviePy.
    """
    # Frame numbers are 1-based, so word 0 belongs to frame 1.
    captions = dict(enumerate(words, start=1))
    font = cv2.FONT_HERSHEY_SIMPLEX
    rgb_frames = []

    cap = cv2.VideoCapture(video_path)
    try:
        width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
        # Left margin of 10 px, vertically centred baseline.
        origin = (10, int(height) // 2)

        while cap.isOpened():
            ok, frame = cap.read()
            if not ok:
                break
            # After a successful read, POS_FRAMES holds the index of the next
            # frame, i.e. the 1-based number of the frame just returned.
            # OpenCV reports it as a float, so convert before the lookup.
            frame_no = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            word = captions.get(frame_no)
            if word is not None:
                # Keep a 10 px margin on both sides of the text.
                # NOTE(review): get_optimal_font_scale measures with
                # FONT_HERSHEY_DUPLEX at thickness 1 by default, while the
                # text is drawn with FONT_HERSHEY_SIMPLEX at thickness 2, so
                # the fit is approximate - confirm whether that is intended.
                font_scale = get_optimal_font_scale(word, width - 20)
                cv2.putText(frame, word, origin, font,
                            font_scale,
                            (0, 0, 0), 2, cv2.LINE_AA)
            # OpenCV decodes frames as BGR, whereas MoviePy expects RGB;
            # without this conversion red and blue are swapped in the output.
            rgb_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always hand the decoder back, even if drawing fails midway.
        cap.release()
    return rgb_frames


def inference(audio, image):
    """Transcribe ``audio`` and render a captioned video of ``image``.

    The still image is turned into a video with one frame per transcribed
    word, each frame is overlaid with its word, and the original audio is
    attached to the result.

    Args:
        audio: Object whose ``name`` attribute is the path of the audio file.
        image: Object whose ``name`` attribute is the path of the image file.

    Returns:
        A ``(transcription, video_path)`` tuple, where ``video_path`` is
        always ``'output6.mp4'``.

    Note:
        The intermediate and final videos are written to fixed file names in
        the current working directory (``my_video.mp4``,
        ``new_filename.mp4``, ``output6.mp4``), so overlapping calls would
        overwrite each other's files.
    """
    transcription = _transcribe(audio.name)
    words = transcription.split()

    audio_clip = AudioFileClip(audio.name)
    videoclip = None
    try:
        # One frame per word across the whole audio. Clamp the word count to
        # one so an empty transcription still yields a positive frame rate.
        fps = max(len(words), 1) / audio_clip.duration

        image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
        # Make the height 360 px; according to the MoviePy documentation the
        # width is then computed so that the width/height ratio is conserved.
        image_clip = image_clip.resize(height=360)
        image_clip.write_videofile("my_video.mp4", fps=fps)

        videoclip = VideoFileClip("my_video.mp4")
        new_audioclip = CompositeAudioClip([audio_clip])
        videoclip.audio = new_audioclip
        videoclip.write_videofile("new_filename.mp4")

        frame_list = _caption_frames("new_filename.mp4", words)

        output_clip = ImageSequenceClip(frame_list, fps=fps)
        output_clip.audio = new_audioclip
        output_clip.write_videofile("output6.mp4")
    finally:
        # Close the file-backed clips only after the final video has been
        # written, because the output still reads its audio from audio_clip.
        if videoclip is not None:
            videoclip.close()
        audio_clip.close()

    return transcription, 'output6.mp4'
# Text shown on the demo page.
title = "Hubert-xlarge-ls960-ft"
description = "Gradio demo for hubert-xlarge-ls960-ft. To use it, simply add your audio file and image, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.07447'>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</a> | <a href='https://github.com/pytorch/fairseq/tree/main/examples/hubert'>Github Repo</a></p>"

# One example row, in the same order as the interface inputs: audio, image.
examples = [["sample.wav", "example.jpeg"]]

# Input and output components; both inputs are passed to `inference` as files.
audio_input = gr.inputs.Audio(type="file")
image_input = gr.inputs.Image(type="file", label="Input")
text_output = gr.outputs.Textbox(label="Output")
video_output = gr.outputs.Video(label="Video Out")

demo = gr.Interface(
    fn=inference,
    inputs=[audio_input, image_input],
    outputs=[text_output, video_output],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
)
demo.launch(debug=True)