File size: 3,139 Bytes
692312c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from typing import Dict, List, Any
from datasets import load_dataset
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch, numpy as np
import io
import soundfile as sf

from audiocraft.models import MusicGen

import yaml
import math
import torchaudio
import torch
from audiocraft.utils.notebook import display_audio

def get_bip_bip(
        bip_duration=0.125, frequency=440, duration=0.5, sample_rate=32000, device="cuda"):
    """Generate a series of "bip bip" tones at the given frequency.

    The signal is a cosine wave gated by an on/off envelope whose period is
    ``2 * bip_duration`` seconds: the first half of every period is silent,
    the second half carries the tone.

    Args:
        bip_duration: Length in seconds of each silent / audible half-period.
        frequency: Tone frequency in Hz.
        duration: Total length of the signal in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the tensors are created.

    Returns:
        A float tensor of shape ``[1, int(duration * sample_rate)]``.
    """
    # Honour the caller-supplied device instead of always requiring CUDA.
    t = torch.arange(
        int(duration * sample_rate), device=device, dtype=torch.float) / sample_rate
    # Honour the requested frequency instead of a fixed 440 Hz tone.
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Position inside the current on/off period, normalised to [0, 1).
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    envelope = (tp >= 0.5).float()
    return wav * envelope

def load_conf(conf):
  """Load a YAML configuration file.

  Args:
    conf: Path of the YAML file to read.

  Returns:
    The parsed YAML document (whatever ``yaml.safe_load`` yields for it).
  """
  with open(conf, 'r') as f:
    # PyYAML's safe loader is spelled ``safe_load``; ``safeload`` does not exist.
    return yaml.safe_load(f)
  
class generator:
    """Melody-conditioned music generator built on audiocraft's ``MusicGen``.

    The YAML configuration file must provide the keys read by this class:

        model             name passed to ``AutoProcessor`` and ``MusicGen``
        duration          forwarded to ``set_generation_params``
        sampling_rate     default sample rate of the melody audio
        nth_slice_prompt  divisor used by ``preprocess`` to shorten the audio
    """

    def __init__(self, conf_file):
        """Load the configuration, the processor and the MusicGen model.

        Args:
            conf_file: Path of the YAML configuration file.
        """
        self.conf = load_conf(conf_file)
        self.processor = AutoProcessor.from_pretrained(self.conf['model'])
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # audiocraft's MusicGen wrapper is not an nn.Module: the target device
        # is selected at load time rather than through ``.to(device)``.
        self.model = MusicGen.get_pretrained(self.conf['model'], device=device)
        self.model.set_generation_params(
            use_sampling=True,
            top_k=250,
            duration=self.conf['duration'],
        )
        # audiocraft exposes the output rate as ``sample_rate``; the
        # ``config.audio_encoder`` path belongs to the transformers model.
        self.sampling_rate = self.model.sample_rate

    def preprocess(self, text, audio):
        """Keep only the leading ``1 / nth_slice_prompt`` part of ``audio``.

        Args:
            text: Text prompt; returned unchanged.
            audio: Sliceable sequence of samples.

        Returns:
            A ``(text, audio)`` tuple where ``audio`` is the shortened prompt.
        """
        keep = int(len(audio) // self.conf['nth_slice_prompt'])
        return text, audio[:keep]

    def generate(self, text: list, audio: np.array, **kwargs):
        """Generate audio conditioned on text and on a melody.

        Args:
            text: A single description or a list of descriptions, e.g.
                ["modern melodic electronic dance music",
                 "80s blues track with groovy saxophone"].
            audio: Melody as a NumPy array or torch tensor. A 1-D input is
                treated as a single-channel waveform.
                NOTE(review): 2-D input is assumed to be channels-first
                ([C, T]) -- confirm against callers.
            **kwargs: ``sampling_rate`` may be given to override the
                configured melody sample rate; other keys are ignored.

        Returns:
            Whatever ``MusicGen.generate_with_chroma`` returns.
        """
        # A list of prompts must not be wrapped in a second list.
        descriptions = [text] if isinstance(text, str) else list(text)

        melody = audio
        if isinstance(melody, np.ndarray):
            # generate_with_chroma works on float tensors, not NumPy arrays.
            melody = torch.as_tensor(melody, dtype=torch.float32)
        if isinstance(melody, torch.Tensor) and melody.dim() == 1:
            melody = melody[None]  # add the channel dimension -> [1, T]

        sample_rate = kwargs.get('sampling_rate')
        if sample_rate is None:
            sample_rate = self.conf['sampling_rate']

        return self.model.generate_with_chroma(
            descriptions=descriptions,
            melody_wavs=melody,
            melody_sample_rate=sample_rate,
            progress=True,
        )
		

class EndpointHandler:
    """Inference endpoint entry point: turns a request payload into audio."""

    def __init__(self, path=""):
        """Load the models used by the endpoint.

        Args:
            path: Location of the model and processor files.
        """
        # load model and processor from path
        # NOTE(review): this processor and fp16 transformers model are not
        # used by ``__call__``, which goes through ``self.generator`` --
        # confirm whether they are still needed.
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = MusicgenForConditionalGeneration.from_pretrained(
            path, torch_dtype=torch.float16).to("cuda")
        self.generator = generator('conf.yaml')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate audio for one request.

        Args:
            data: The payload. ``"text"`` holds the prompt and ``"audio"``
                the encoded melody file as bytes; ``"parameters"`` is
                removed from the payload but currently unused.

        Returns:
            A one-element list ``[{"generated_audio": samples}]`` where
            ``samples`` is the generated waveform as nested Python lists.
        """
        # process input
        # NOTE(review): a missing key falls back to the whole payload dict,
        # which for "audio" fails inside ``io.BytesIO`` -- confirm intent.
        text = data.pop("text", data)
        audio = data.pop("audio", data)
        data.pop("parameters", None)

        audio, sr = sf.read(io.BytesIO(audio))
        if audio.ndim == 2:
            # soundfile yields (frames, channels); use channels-first layout.
            audio = audio.T

        # The generation logic lives on the wrapped generator, not on the
        # handler; the decoded file's sample rate is forwarded by keyword.
        output = self.generator.generate(text, audio, sampling_rate=sr)

        # postprocess the prediction
        prediction = output.squeeze().cpu().numpy().tolist()

        return [{"generated_audio": prediction}]