import torch
from transformers import Wav2Vec2Processor, HubertForCTC
import gradio as gr
from moviepy.editor import AudioFileClip, ImageClip, VideoFileClip, CompositeAudioClip, ImageSequenceClip
import cv2
import librosa
def get_optimal_font_scale(text, width):
    # Walk the font scale down from 5.9 to 0 in steps of 0.1 and return the
    # first (i.e. largest) scale at which the rendered text fits within `width` pixels.
    for scale in reversed(range(0, 60, 1)):
        textSize = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_DUPLEX, fontScale=scale/10, thickness=1)
        new_width = textSize[0][0]
        if new_width <= width:
            return scale/10
    return 1
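# Illustrative only (actual pixel widths depend on the OpenCV build's font
# metrics): get_optimal_font_scale("HELLO WORLD", 320) might return e.g. 1.4,
# the largest scale in {0.0, 0.1, ..., 5.9} whose rendered width is <= 320.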
# hubert-xlarge-ls960-ft: HuBERT X-Large fine-tuned for CTC on 960h of LibriSpeech.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft")
def inference(audio, image):
    # Transcribe the audio with HuBERT (greedy CTC decoding: argmax per time
    # step, then processor.decode() collapses repeats and strips blanks).
    y, sr = librosa.load(audio.name, sr=16000)
    input_values = processor(y, sampling_rate=sr, return_tensors="pt").input_values  # batch size 1
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    # Render the still image as a silent video lasting as long as the audio.
    audio_clip = AudioFileClip(audio.name)
    image_clip = ImageClip(image.name).set_duration(audio_clip.duration)
    # Resize to a height of 360px; per the MoviePy documentation, the width is
    # then computed so that the width/height ratio is preserved.
    image_clip = image_clip.resize(height=360)
    image_clip.write_videofile("my_video.mp4", fps=len(transcription.split())/audio_clip.duration)

    # Attach the original audio track to the still-image video.
    videoclip = VideoFileClip("my_video.mp4")
    new_audioclip = CompositeAudioClip([audio_clip])
    videoclip.audio = new_audioclip
    videoclip.write_videofile("new_filename.mp4")
    frames = {k + 1: v.strip() for k, v in enumerate(transcription.split())}

    cap = cv2.VideoCapture('new_filename.mp4')
    w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    font = cv2.FONT_HERSHEY_SIMPLEX
    frame_list = []
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            frame_no = cap.get(cv2.CAP_PROP_POS_FRAMES)
            if frame_no in frames:
                # Burn the current word into the vertical centre of the frame,
                # at the largest font scale that still fits the frame width.
                fontScale = get_optimal_font_scale(frames[frame_no], w - 20)
                cv2.putText(frame, frames[frame_no], (10, int(h)//2), font,
                            fontScale, (0, 0, 0), 2, cv2.LINE_AA)
            # OpenCV decodes frames as BGR; MoviePy expects RGB, so convert.
            frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        else:
            break
    cap.release()

    # Reassemble the captioned frames at the same one-word-per-frame rate and
    # re-attach the audio.
    output_clip = ImageSequenceClip(frame_list, fps=len(transcription.split())/audio_clip.duration)
    output_clip.audio = new_audioclip
    output_clip.write_videofile("output6.mp4")
    return transcription, 'output6.mp4'
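# A minimal local smoke test (hypothetical; Gradio's 'file'-typed inputs hand
# the function tempfile-like objects exposing a .name path):
#
#   class _File:
#       def __init__(self, name): self.name = name
#   text, video = inference(_File("sample.wav"), _File("example.jpeg"))
#   print(text, video)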
title = "Hubert-xlarge-ls960-ft" | |
description = "Gradio demo for hubert-xlarge-ls960-ft. To use it, simply add your audio file and image, or click one of the examples to load them. Read more at the links below." | |
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.07447'>HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units</a> | <a href='https://github.com/pytorch/fairseq/tree/main/examples/hubert'>Github Repo</a></p>" | |
examples = [['sample.wav','example.jpeg']] | |
gr.Interface(
    inference,
    [gr.inputs.Audio(type='file'), gr.inputs.Image(type="file", label="Input")],
    [gr.outputs.Textbox(label="Output"), gr.outputs.Video(label="Video Out")],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples
).launch(debug=True)