# Spaces: Runtime error
# (The line above appears to be a status banner captured with the source; it is not program text.)
import torch | |
import gradio as gr | |
from transformers import ViTImageProcessor, ViTModel | |
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder | |
from pedalboard.io import AudioFile | |
from pedalboard import Pedalboard, NoiseGate, Compressor, LowShelfFilter, Gain, HighShelfFilter, Reverb | |
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Two separate random generators on that device. They are handed to the
# pipeline as `generator` and `step_generator` in _generate_spectrogram.
generator1, generator2 = torch.Generator(device), torch.Generator(device)

# Pretrained audio-diffusion pipeline, moved to the selected device.
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff')
pipe = pipe.to(device)

# The image preprocessor and the ViT backbone are loaded from the same
# checkpoint and wrapped together in a single ImageEncoder.
# Note: the ViT model is not moved to `device` here; only the encoded result
# is moved, in _encode_image.
_VIT_CHECKPOINT = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(_VIT_CHECKPOINT)
extractor = ViTModel.from_pretrained(_VIT_CHECKPOINT)
image_encoder = ImageEncoder(processor, extractor)
# Effect chain used by _denoise_audio to post-process the generated audio.
_EFFECT_CHAIN = [
    NoiseGate(threshold_db=-60, ratio=10.0),
    # NOTE(review): a positive 60 dB threshold combined with ratio=1.0 looks
    # like a pass-through setting for a compressor -- confirm this is intended.
    Compressor(threshold_db=60, ratio=1.0),
    LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),
    HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),
    Gain(gain_db=40),
    Reverb(room_size=0.5),
]
board = Pedalboard(_EFFECT_CHAIN)
def _encode_image(image):
    """Turn a conditioning image into the encoding passed to the pipeline.

    The image is handed to ``image_encoder.encode``; the result gets a new
    axis inserted at position 1 and is then moved to ``device``.
    """
    embedding = image_encoder.encode(image)
    expanded = torch.unsqueeze(embedding, dim=1)
    return expanded.to(device)
def _generate_spectrogram(condition, steps, eta):
    """Run the diffusion pipeline once for a single sample.

    Args:
        condition: Forwarded to the pipeline as ``encoding``.
        steps: Forwarded to the pipeline as ``steps``.
        eta: Forwarded to the pipeline as ``eta``.

    Returns:
        ``(image, (sample_rate, audio))``: the first image and the first
        audio entry of the batch, plus the sample rate the pipeline returned.
    """
    sampling_options = {
        "batch_size": 1,
        "steps": steps,
        "generator": generator1,
        "step_generator": generator2,
        "encoding": condition,
        "eta": eta,
        # Ask for the plain tuple form so it can be unpacked directly below.
        "return_dict": False,
    }
    images, (sample_rate, audios) = pipe(**sampling_options)

    # batch_size is 1, so only the first entry of each output is kept.
    first_image = images[0]
    first_audio = audios[0]
    return first_image, (sample_rate, first_audio)
def _denoise_audio(audio, sr):
    """Pass ``audio`` (with sample rate ``sr``) through the module-level ``board``.

    Returns whatever ``board`` returns for that input.
    """
    processed = board(audio, sr)
    return processed
def run_generation(image, steps, eta):
    """Generate a spectrogram and audio clip conditioned on ``image``.

    This is the callback wired to the "Generate" button.

    Args:
        image: Conditioning image from the ``gr.Image`` input, or ``None``
            when the user has not provided one.
        steps: Value of the "Denoising steps" slider.
        eta: Value of the "η" slider.

    Returns:
        ``(spectrogram, (sample_rate, audio))``, matching the
        ``[spectrogram, audio]`` outputs of the click handler.

    Raises:
        gr.Error: If no conditioning image was provided.
    """
    # Without this guard, None would be passed into the image encoder and
    # fail there with an error that says nothing useful to the user.
    if image is None:
        raise gr.Error("Please provide a conditioning image before generating.")

    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    audio = _denoise_audio(audio, sr)
    return spectrogram, (sr, audio)
# NOTE(review): the nesting below was reconstructed from Gradio's
# context-manager structure -- confirm the column placement against the
# intended layout.
with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
# Image-based soundtrack generation
''')

    with gr.Row():
        # Left column: conditioning image, sampling controls and buttons.
        with gr.Column():
            image = gr.Image(type="pil", label="Conditioning image")
            steps = gr.Slider(minimum=10, maximum=1000, step=10, value=50, label="Denoising steps")
            eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.6, label="η")
            gr.Markdown('''
Eta (η) is a variable that controls the level of interpolation between deterministic (η=0.0) and stochastic (η=1.0) denoising schedule.
''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)

        # Right column: outputs filled in by run_generation.
        with gr.Column():
            spectrogram = gr.Image(label="Generated Mel spectrogram")
            audio = gr.Audio(label="Resulting audio")

    # Clicking "Generate" sends the three inputs to run_generation and routes
    # its two return values to the output components.
    btn.click(
        run_generation,
        inputs=[image, steps, eta],
        outputs=[spectrogram, audio],
    )

demo.launch()