doodle-musegen / handler.py
supermomo668
handler
692312c
from typing import Dict, List, Any
from datasets import load_dataset
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch, numpy as np
import io
import soundfile as sf
from audiocraft.models import MusicGen
import yaml
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio
def get_bip_bip(
        bip_duration=0.125, frequency=440, duration=0.5, sample_rate=32000, device="cuda"):
    """Generate a gated cosine tone: alternating silent and audible "bip" segments.

    Args:
        bip_duration: Length in seconds of each silent and each audible segment.
        frequency: Frequency of the cosine tone in Hz.
        duration: Total length of the signal in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the tensors are created.

    Returns:
        A float tensor of shape ``(1, int(duration * sample_rate))``.
    """
    # Time axis in seconds; created on the requested device instead of a
    # hard-coded "cuda" so the helper also works on CPU-only hosts.
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    # Use the caller-supplied frequency rather than a fixed 440 Hz.
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Position inside the current on/off period, normalised to [0, 1).
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    # First half of each period is silent, second half lets the tone through.
    envelope = (tp >= 0.5).float()
    return wav * envelope
def load_conf(conf):
    """Read a YAML configuration file and return its parsed content.

    Args:
        conf: Path of the YAML file to read.

    Returns:
        The object produced by parsing the file (a dict for a mapping document).
    """
    with open(conf, 'r', encoding='utf-8') as f:
        # PyYAML's safe loader is spelled ``safe_load``; ``safeload`` does not
        # exist and raised AttributeError on every call.
        return yaml.safe_load(f)
class generator:
    """Melody-conditioned MusicGen wrapper configured from a YAML file.

    Configuration keys read by this class:
        model: name handed to ``AutoProcessor.from_pretrained`` and
            ``MusicGen.get_pretrained``.
        duration: forwarded to ``set_generation_params``.
        sampling_rate: default sample rate reported for the melody audio.
        nth_slice_prompt: divisor used by ``preprocess`` to shorten the audio.
    """

    def __init__(self, conf_file):
        """Load the configuration, the processor and the audiocraft model.

        Args:
            conf_file: Path of the YAML configuration file.
        """
        self.conf = load_conf(conf_file)
        self.processor = AutoProcessor.from_pretrained(self.conf['model'])
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # audiocraft's MusicGen is not a torch module with ``.to()``; the
        # target device is chosen when the pretrained model is loaded.
        self.model = MusicGen.get_pretrained(self.conf['model'], device=device)
        self.model.set_generation_params(
            use_sampling=True,
            top_k=250,
            duration=self.conf['duration']
        )
        # audiocraft exposes the output rate as ``sample_rate``; the
        # ``config.audio_encoder`` path belongs to the transformers model.
        self.sampling_rate = self.model.sample_rate

    @staticmethod
    def _as_descriptions(text):
        """Return ``text`` as a flat list of prompt strings."""
        if isinstance(text, str):
            return [text]
        return list(text)

    @staticmethod
    def _as_melody(audio, batch_size):
        """Turn array-like audio into a float32 tensor with a leading batch axis."""
        if not isinstance(audio, (np.ndarray, torch.Tensor)):
            # Anything else (e.g. a list of tensors) is passed through untouched.
            return audio
        melody = torch.as_tensor(audio, dtype=torch.float32)
        if melody.dim() == 1:
            # Bare sample vector: add a channel axis.
            melody = melody[None]
        if melody.dim() == 2:
            # NOTE(review): 2-D input is assumed to be (channels, samples) and
            # is shared by every description - confirm against the caller.
            melody = melody[None].repeat(batch_size, 1, 1)
        return melody

    def preprocess(self, text, audio):
        """Keep only the leading ``1 / nth_slice_prompt`` part of the audio.

        Args:
            text: Prompt(s); returned unchanged.
            audio: Sliceable sequence of samples.

        Returns:
            Tuple ``(text, shortened_audio)``.
        """
        audio = audio[: int(len(audio) // self.conf['nth_slice_prompt'])]
        # The slice used to be computed and then discarded (implicit None).
        return text, audio

    def generate(self, text: list, audio: np.array, **kwargs):
        """Generate audio conditioned on text prompts and a melody.

        Args:
            text: A prompt string or a list of prompt strings, e.g.
                ["modern melodic electronic dance music",
                 "80s blues track with groovy saxophone"].
            audio: Melody samples as a numpy array or torch tensor.
            **kwargs: ``sampling_rate`` overrides the configured melody sample
                rate; other keys are ignored.

        Returns:
            Whatever ``generate_with_chroma`` returns for the request.
        """
        # A list of prompts must not be wrapped in a second list.
        descriptions = self._as_descriptions(text)
        # NOTE(review): generate_with_chroma is expected to take tensors, one
        # melody per description - confirm for the audiocraft version in use.
        melody = self._as_melody(audio, len(descriptions))
        melody_sr = kwargs.get('sampling_rate') or self.conf['sampling_rate']
        output = self.model.generate_with_chroma(
            descriptions=descriptions,
            melody_wavs=melody,
            melody_sample_rate=melody_sr,
            progress=True
        )
        return output
class EndpointHandler:
    """Inference-endpoint entry point: text prompt plus melody audio in, audio out."""

    def __init__(self, path=""):
        """Load the processor, the transformers model and the melody generator.

        Args:
            path: Location handed to the ``from_pretrained`` loaders.
        """
        # Pick the device the same way ``generator`` does instead of assuming a
        # GPU; half precision is only requested when CUDA is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if device == "cuda" else torch.float32
        # load model and processor from path
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = MusicgenForConditionalGeneration.from_pretrained(
            path, torch_dtype=dtype).to(device)
        # NOTE(review): 'conf.yaml' is resolved against the working directory,
        # not against ``path`` - confirm that is where the file is deployed.
        self.generator = generator('conf.yaml')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run melody-conditioned generation for one request payload.

        Args:
            data: Payload with ``"text"`` (the prompt or prompts), ``"audio"``
                (bytes of an encoded audio file) and optionally
                ``"parameters"``, which is accepted but currently unused.

        Returns:
            A one-element list ``[{"generated_audio": samples}]`` where
            ``samples`` is the generated audio as nested Python lists.

        Raises:
            ValueError: If ``"text"`` or ``"audio"`` is missing from the payload.
        """
        # process input
        text = data.pop("text", None)
        audio = data.pop("audio", None)
        data.pop("parameters", None)
        if text is None or audio is None:
            # Falling back to the whole payload only produced an obscure
            # failure further down; fail early with a clear message instead.
            raise ValueError("payload must contain both 'text' and 'audio'")

        samples, sr = sf.read(io.BytesIO(audio))
        melody = torch.from_numpy(samples).float()
        # soundfile yields (frames,) for mono and (frames, channels) otherwise;
        # reorder to (channels, frames) before handing the melody on.
        melody = melody[None] if melody.dim() == 1 else melody.t().contiguous()

        # The generation logic lives on ``self.generator``; this class has no
        # ``generate`` method. The decoded sample rate is passed by keyword.
        output = self.generator.generate(text, melody, sampling_rate=sr)

        # postprocess the prediction
        prediction = output.squeeze().cpu().numpy().tolist()
        return [{"generated_audio": prediction}]