from typing import Any, Dict, List

import io
import math

import numpy as np
import soundfile as sf
import torch
import yaml

from audiocraft.models import MusicGen
def get_bip_bip(
        bip_duration=0.125, frequency=440, duration=0.5, sample_rate=32000, device="cuda"):
    """Generate a series of bips at the given frequency, for use as a test melody."""
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Square-wave envelope: on for the second half of each 2 * bip_duration period.
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    envelope = (tp >= 0.5).float()
    return wav * envelope
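# Usage sketch (the parameter values here are illustrative, not from the original code):
#
#   melody = get_bip_bip(frequency=660, duration=4.0, device="cpu")
#   print(melody.shape)  # torch.Size([1, 128000]) at 32 kHz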
def load_conf(conf):
    """Load generation settings from a YAML config file."""
    with open(conf, 'r') as f:
        conf = yaml.safe_load(f)
    return conf
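# A minimal conf.yaml sketch, covering the keys read elsewhere in this file
# (model, sampling_rate, duration, nth_slice_prompt); the values are illustrative:
#
#   model: facebook/musicgen-melody
#   sampling_rate: 32000
#   duration: 10
#   nth_slice_prompt: 2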
class Generator:
    def __init__(self, conf_file):
        """
        Expected conf keys:
            model: audiocraft model name (e.g. facebook/musicgen-melody)
            sampling_rate: sample rate of incoming melody prompts
            duration: length of the generated audio, in seconds
            nth_slice_prompt: keep the first 1/n of the melody prompt
        """
        self.conf = load_conf(conf_file)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # audiocraft's MusicGen wrapper takes the device at load time rather
        # than exposing an nn.Module-style .to().
        self.model = MusicGen.get_pretrained(self.conf['model'], device=device)
        self.model.set_generation_params(
            use_sampling=True,
            top_k=250,
            duration=self.conf['duration']
        )
        self.sampling_rate = self.model.sample_rate
    def preprocess(self, text, audio):
        # Keep only the first 1/nth_slice_prompt of the melody prompt.
        audio = audio[: int(len(audio) // self.conf['nth_slice_prompt'])]
        return audio
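# E.g. with nth_slice_prompt = 2, a 10-second clip yields a 5-second melody prompt.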
    def generate(self, text: list, audio: np.ndarray, sample_rate: int = None, **kwargs):
        """
        text: e.g. ["modern melodic electronic dance music", "80s blues track with groovy saxophone"]
        audio (np.ndarray): melody prompt waveform, as returned by soundfile
        sample_rate: sample rate of `audio`; defaults to the configured value
        """
        melody = torch.from_numpy(audio).float()
        if melody.dim() == 2:
            # soundfile returns [T, C]; audiocraft expects [C, T]
            melody = melody.t()
        else:
            melody = melody[None]
        output = self.model.generate_with_chroma(
            descriptions=text,
            melody_wavs=melody,
            melody_sample_rate=sample_rate or self.conf['sampling_rate'],
            progress=True
        )
        return output
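# A minimal usage sketch (assumes a conf.yaml like the one above; "melody.wav"
# is a hypothetical path):
#
#   gen = Generator('conf.yaml')
#   wav, sr = sf.read('melody.wav')
#   wav = gen.preprocess(None, wav)
#   out = gen.generate(["80s blues track with groovy saxophone"], wav, sample_rate=sr)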
class EndpointHandler:
    def __init__(self, path=""):
        # Generation is delegated entirely to the audiocraft-based Generator;
        # the conf file is expected alongside the handler.
        self.generator = Generator('conf.yaml')
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (:dict:):
                The payload with the text prompt, the melody audio bytes,
                and generation parameters.
        """
        # process input
        text = data.pop("text", data)
        audio = data.pop("audio", data)
        parameters = data.pop("parameters", None)  # reserved for generation kwargs, currently unused
        audio, sr = sf.read(io.BytesIO(audio))
        audio = self.generator.preprocess(text, audio)
        output = self.generator.generate(text, audio, sample_rate=sr)
        # postprocess the prediction
        prediction = output.squeeze().cpu().numpy().tolist()
        return [{"generated_audio": prediction}]