Spaces:
Paused
Paused
Synced repo using 'sync_with_huggingface' Github Action
Browse files- .gitattributes +1 -0
- app.py +362 -0
- nate_is_humming.wav +3 -0
- nate_is_singing_Gb_minor.wav +0 -0
- pitch_correction_utils.py +161 -0
- requirements.txt +6 -0
- singing_songstarter_demo.ipynb +78 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
nate_is_humming.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
###########################################
|
2 |
+
# For fast downloads from Hugging Face Hub
|
3 |
+
# **Requires the hf_transfer package**
|
4 |
+
###########################################
|
5 |
+
import os
|
6 |
+
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
7 |
+
###########################################
|
8 |
+
|
9 |
+
import json
|
10 |
+
import random
|
11 |
+
import typing as tp
|
12 |
+
from datetime import datetime
|
13 |
+
from pathlib import Path
|
14 |
+
from functools import partial
|
15 |
+
|
16 |
+
import gradio as gr
|
17 |
+
import torch
|
18 |
+
import torchaudio
|
19 |
+
import numpy as np
|
20 |
+
|
21 |
+
from audiocraft.models import musicgen
|
22 |
+
from audiocraft.data.audio import audio_write
|
23 |
+
from audiocraft.utils.notebook import display_audio
|
24 |
+
|
25 |
+
from pitch_correction_utils import autotune, closest_pitch, aclosest_pitch_from_scale
|
26 |
+
|
27 |
+
|
28 |
+
def ta_to_librosa_format(waveform):
    """
    Convert an audio tensor from torchaudio format to a NumPy array.

    Args:
        waveform (torch.Tensor): Audio tensor, normally shaped (n_channels, n_samples).
            It may live on any device and may be tracking gradients.

    Returns:
        np.ndarray: For a multi-dimensional input whose first axis has length 1, that
            axis is dropped, e.g. (1, n_samples) -> (n_samples,). Any other input has
            its axes reversed, e.g. (n_channels, n_samples) -> (n_samples, n_channels);
            a 1-D input therefore keeps its shape. int16/int32 data is divided by the
            dtype's maximum value, which yields a floating-point array.

    NOTE(review): librosa's own multi-channel layout is channels-first, so the
    transpose below looks questionable. It is kept because ``librosa_to_ta_format``
    reverses it and the only caller in this file passes mono audio.
    """
    # Detach and copy to host memory first: ``Tensor.numpy()`` raises for CUDA
    # tensors and for tensors that require grad.
    waveform_np = waveform.detach().cpu().numpy()

    if waveform_np.ndim > 1 and waveform_np.shape[0] == 1:
        # Remove the channel dimension for mono. The ndim guard keeps a
        # one-sample 1-D signal from collapsing into a 0-d array.
        waveform_np = waveform_np.squeeze(0)
    else:
        # Switch from (n_channels, n_samples) to (n_samples, n_channels);
        # a no-op for 1-D input.
        waveform_np = waveform_np.transpose()

    # Scale integer PCM into roughly [-1, 1]; floating-point input is left untouched.
    if waveform_np.dtype in (np.int16, np.int32):
        waveform_np = waveform_np / np.iinfo(waveform_np.dtype).max

    return waveform_np
|
54 |
+
|
55 |
+
|
56 |
+
def librosa_to_ta_format(waveform_np):
    """
    Turn a NumPy/array-like waveform into a float32 torch tensor.

    Args:
        waveform_np (np.ndarray): Audio samples. A 1-D array is treated as mono;
            anything else is expected in the transposed layout produced by
            ``ta_to_librosa_format`` (n_samples, n_channels).

    Returns:
        torch.Tensor: float32 tensor. 1-D input gains a leading channel axis,
            giving (1, n_samples); other input has its axes reversed, giving
            (n_channels, n_samples).
    """
    # Copy into a float32 array so the tensor below never aliases the caller's data.
    samples = np.array(waveform_np, dtype=np.float32)

    # Mono: prepend the channel axis. Otherwise: flip back to channels-first.
    channels_first = samples[np.newaxis, :] if samples.ndim == 1 else samples.T

    return torch.from_numpy(channels_first)
|
79 |
+
|
80 |
+
|
81 |
+
def run_autotune(y, sr, correction_method="closest", scale=None):
    """
    Pitch-correct a torchaudio waveform and return it as a torchaudio-style tensor.

    Args:
        y (torch.Tensor): Input audio. When it has more than one dimension only
            the first channel (``y[0, :]``) is processed.
        sr (int): Sample rate of ``y``, passed through to ``autotune``.
        correction_method (str): ``'closest'`` snaps to the nearest note via
            ``closest_pitch``; any other value snaps to ``scale`` via
            ``aclosest_pitch_from_scale``.
        scale (str, optional): Scale handed to ``aclosest_pitch_from_scale``;
            unused for the ``'closest'`` method.

    Returns:
        torch.Tensor: The corrected audio after conversion by ``librosa_to_ta_format``.
    """
    # The autotuner is mono-only, so keep just the first channel of multi-channel input.
    mono = y[0, :] if y.ndim > 1 else y

    # Choose how detected pitches get mapped to target pitches.
    if correction_method == 'closest':
        pitch_mapper = closest_pitch
    else:
        pitch_mapper = partial(aclosest_pitch_from_scale, scale=scale)

    # torchaudio -> numpy, autotune, numpy -> torchaudio.
    tuned = autotune(ta_to_librosa_format(mono), sr, pitch_mapper, plot=False)
    return librosa_to_ta_format(tuned)
|
98 |
+
|
99 |
+
|
100 |
+
def set_all_seeds(seed):
    """Seed every random number source this app touches with the same value.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, torch (CPU and CUDA), and switches cuDNN to deterministic mode.

    Args:
        seed: Value handed unchanged to each seeding function.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The remaining generators all take the seed as their only argument.
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
|
107 |
+
|
108 |
+
|
109 |
+
def _preprocess_audio(
    audio_path, model: musicgen.MusicGen, duration: tp.Optional[int] = None
):
    """Load an audio file and encode it with the model's compression model.

    The file is resampled to ``model.sample_rate``, averaged down to one
    channel, cut to ``duration`` seconds, moved to CUDA and encoded.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        model: MusicGen instance providing ``sample_rate`` and ``compression_model``.
        duration: Seconds of audio to keep. Defaults to the full clip length.

    Returns:
        The codes returned by ``model.compression_model.encode``.

    Raises:
        ValueError: If ``duration`` exceeds 30 seconds.
        AssertionError: If the trimmed clip does not contain exactly
            ``model.sample_rate * duration`` samples (for example when the clip
            is shorter than ``duration``), or if the encoder returns a scale.
    """
    target_sr = model.sample_rate

    loaded, source_sr = torchaudio.load(audio_path)
    resampled = torchaudio.functional.resample(loaded, source_sr, target_sr)
    # Collapse all channels into a single mono channel.
    mono = resampled.mean(dim=0, keepdim=True)

    # Without an explicit duration, use the whole clip.
    if duration is None:
        duration = mono.shape[1] / target_sr

    if duration > 30:
        raise ValueError("Duration cannot be more than 30 seconds")

    mono = mono[:, : int(target_sr * duration)]

    assert mono.shape[0] == 1
    assert mono.shape[1] == target_sr * duration

    # Add the extra axis the encoder is called with: (1, 1, n_samples).
    encoder_input = mono.cuda().unsqueeze(1)

    with torch.no_grad():
        codes, scale = model.compression_model.encode(encoder_input)

    assert scale is None

    return codes
|
141 |
+
|
142 |
+
|
143 |
+
def _get_stemmed_wav_patched(wav, sample_rate):
    """Pass-through replacement for a conditioner's ``_get_stemmed_wav``.

    ``Pipeline`` installs this on the model's ``self_wav`` conditioner when
    ``do_skip_demucs`` is set, so the waveform is used as-is.

    Args:
        wav: Waveform that would otherwise be stem-separated.
        sample_rate: Accepted for signature compatibility; unused.

    Returns:
        ``wav``, unchanged.
    """
    notice = "Skipping stem separation!"
    print(notice)
    return wav
|
146 |
+
|
147 |
+
|
148 |
+
class Pipeline:
    """Callable wrapper around a pretrained MusicGen model.

    Generates audio from a text prompt alone, or from a prompt plus a
    pitch-corrected input recording used either as a melody condition or as
    audio to continue. Results are written to disk and also returned in the
    fixed-size list layout the Gradio interface expects.
    """

    def __init__(self, model_id, max_batch_size=4, do_skip_demucs=True):
        """Load the model and optionally bypass stem separation.

        Args:
            model_id: Identifier passed to ``musicgen.MusicGen.get_pretrained``.
            max_batch_size: Largest ``batch_size`` accepted by ``__call__``; also
                fixes the length of the returned list (``max_batch_size + 1``).
            do_skip_demucs: When true, the ``self_wav`` conditioner's
                ``_get_stemmed_wav`` is replaced by a pass-through.
        """
        self.model = musicgen.MusicGen.get_pretrained(model_id)
        self.max_batch_size = max_batch_size
        self.do_skip_demucs = do_skip_demucs

        if self.do_skip_demucs:
            self.model.lm.condition_provider.conditioners.self_wav._get_stemmed_wav = _get_stemmed_wav_patched

    def __call__(
        self,
        prompt,
        input_audio=None,
        scale=None,
        continuation=False,
        batch_size=1,
        duration=15,
        use_sampling=True,
        temperature=1.0,
        top_k=250,
        top_p=0.0,
        cfg_coef=3.0,
        output_dir="./samples",  # change to google drive if you'd like
        normalization_strategy="loudness",
        seed=-1,
        continuation_start=0,
        continuation_end=None,
    ):
        """Generate ``batch_size`` audio samples and save them under ``output_dir``.

        Args:
            prompt: Text description used for every sample in the batch.
            input_audio: Optional path to an audio file. When given it is
                pitch-corrected and used as melody/continuation input.
            scale: Scale for pitch correction (e.g. ``"Gb:min"``). ``None`` or
                ``"closest"`` snaps to the nearest note instead.
            continuation: If true, continue ``input_audio``; otherwise use it
                as a melody condition. Ignored without ``input_audio``.
            batch_size: Number of samples to generate (1..``max_batch_size``).
            duration: Generation length forwarded to the model.
            use_sampling, temperature, top_k, top_p, cfg_coef: Forwarded to
                ``self.model.set_generation_params``.
            output_dir: Directory for the written audio/JSON; created if missing.
            normalization_strategy: ``strategy`` argument for ``audio_write``.
            seed: RNG seed. A falsy value or ``-1`` picks a random seed.
            continuation_start: Start of the input slice, in seconds.
            continuation_end: End of the input slice, in seconds. ``None`` or
                ``-1`` means the end of the input.

        Returns:
            list: ``max_batch_size + 1`` entries. Index 0 is
            ``(sample_rate, ndarray)`` for the pitch-corrected input (``None``
            without input audio); indices ``1..batch_size`` hold the generated
            samples as ``(self.model.sample_rate, ndarray)``; the rest are ``None``.

        Raises:
            ValueError: If ``batch_size`` is outside ``1..max_batch_size`` or
                ``continuation_start`` is greater than ``continuation_end``.
        """
        print("Prompt:", prompt)
        if scale == "closest":
            scale = None

        # UI sliders may hand over whole numbers as floats; list repetition and
        # range() below need a real int.
        batch_size = int(batch_size)
        # Fail before the expensive generation instead of with an IndexError
        # while filling the fixed-size result list afterwards.
        if not 1 <= batch_size <= self.max_batch_size:
            raise ValueError(
                f"`batch_size` must be between 1 and {self.max_batch_size}, got {batch_size}"
            )

        def configure_generation(gen_duration):
            # `use_sampling` is forwarded too, so the caller's choice takes effect.
            self.model.set_generation_params(
                use_sampling=use_sampling,
                duration=gen_duration,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                cfg_coef=cfg_coef,
            )

        if not seed or seed == -1:
            # Parenthesised on purpose: `x % 2 ** 32 - 1` is `(x % 2**32) - 1`,
            # which can be -1 and is rejected by `np.random.seed`.
            seed = torch.seed() % (2 ** 32 - 1)
        # The seeding functions reject float seeds.
        seed = int(seed)
        set_all_seeds(seed)
        print(f"Using seed {seed}")

        # Remember whether an input file was supplied before `input_audio` is
        # rebound to the loaded tensor.
        has_input_audio = bool(input_audio)

        if not has_input_audio:
            configure_generation(duration)
            wav, _tokens = self.model.generate([prompt] * batch_size, progress=True, return_tokens=True)
        else:
            input_audio, sr = torchaudio.load(input_audio)
            # Save a copy of the original input audio
            original_input_audio = input_audio.clone()
            print("Input audio shape:", input_audio.shape)
            if scale is None:
                print("Running pitch correction for 'closest' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="closest")
            else:
                print("Running pitch correction for 'scale' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="scale", scale=scale)
            print(f"...Done running pitch correction. Shape after is {input_audio.shape}.\n")
            # Add a leading batch axis when the audio is 2-D.
            input_audio = input_audio[None] if input_audio.dim() == 2 else input_audio

            continuation_start = 0 if not continuation_start else continuation_start
            if continuation_end is None or continuation_end == -1:
                continuation_end = input_audio.shape[2] / sr

            if continuation_start > continuation_end:
                raise ValueError(
                    "`continuation_start` must be less than or equal to `continuation_end`"
                )

            # Cut the requested time window (seconds -> samples) and repeat it
            # once per batch item.
            input_audio_wavform = input_audio[
                ..., int(sr * continuation_start) : int(sr * continuation_end)
            ]
            input_audio_wavform = input_audio_wavform.repeat(batch_size, 1, 1)

            # TODO - the input clip's own length is not added to `duration`
            # for continuation - is that wrong??
            configure_generation(duration)
            if continuation:
                print("Continuation wavform shape!", input_audio_wavform.shape)
                wav, _tokens = self.model.generate_continuation(
                    prompt=input_audio_wavform,
                    prompt_sample_rate=sr,
                    descriptions=[prompt] * batch_size,
                    progress=True,
                    return_tokens=True
                )
            else:
                print("Melody wavform shape!", input_audio_wavform.shape)
                wav, _tokens = self.model.generate_with_chroma(
                    [prompt] * batch_size, input_audio_wavform, sr, progress=True, return_tokens=True
                )
        wav = wav.cpu()

        # Write to files
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if has_input_audio:
            audio_write(
                output_dir / f"{dt_str}_input_raw",
                original_input_audio,
                sr,
                strategy=normalization_strategy,
            )
            audio_write(
                output_dir / f"{dt_str}_input_pitch_corrected",
                input_audio_wavform[0],
                sr,
                strategy=normalization_strategy,
            )

        for i in range(batch_size):
            audio_write(
                output_dir / f"{dt_str}_{i:02d}",
                wav[i],
                self.model.sample_rate,
                strategy=normalization_strategy,
            )

        # Record the settings next to the audio. The seed is included because
        # it may have been chosen randomly above.
        json_out_path = output_dir / f"{dt_str}.json"
        json_out_path.write_text(json.dumps(dict(
            prompt=prompt,
            batch_size=batch_size,
            duration=duration,
            use_sampling=use_sampling,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            cfg_coef=cfg_coef,
            seed=seed,
        )))

        # Fixed-length result: slot 0 is the (corrected) input, the following
        # slots are the generated samples, unused slots stay None.
        to_return = [None] * (self.max_batch_size + 1)
        if has_input_audio:
            print(f"trying to return input audio wavform of shape: {input_audio_wavform.shape}")
            to_return[0] = (sr, input_audio_wavform[0].T.numpy())

        for i in range(batch_size):
            to_return[i + 1] = (self.model.sample_rate, wav[i].T.numpy())
            print(wav[i].shape)
        return to_return
|
291 |
+
|
292 |
+
|
293 |
+
def main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=False, debug=False):
    """Build the Gradio demo around a ``Pipeline`` and launch it.

    Args:
        model_id: Model identifier handed to ``Pipeline``.
        max_batch_size: Upper bound of the batch-size slider and number of
            generated-audio outputs shown.
        share: Forwarded to ``Interface.launch``.
        debug: Forwarded to ``Interface.launch``.
    """
    pipeline = Pipeline(model_id, max_batch_size)

    # "closest" followed by a major and a minor entry for each of the twelve keys.
    key_names = ["A", "Bb", "B", "C", "Db", "D", "Eb", "E", "F", "Gb", "G", "Ab"]
    scale_choices = ["closest"] + [
        f"{key}:{mode}" for key in key_names for mode in ("maj", "min")
    ]

    def example_row(prompt, audio_path, scale, duration, cfg_coef=3.0):
        # The examples differ only in these fields; everything else is shared.
        return [
            prompt, audio_path, scale, False, 1, duration, True,
            1.0, 250, 0.0, cfg_coef, "./samples", "loudness", -1,
        ]

    sung_melody = "./nate_is_singing_Gb_minor.wav"
    examples = [
        example_row("hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm", None, "closest", 8),
        example_row("acoustic, guitar, melody, rnb, trap, E minor, 85 bpm", None, "closest", 8),
        example_row("synth, dark, hip hop, melody, trap, Gb minor, 140 bpm", sung_melody, "Gb:min", 7),
        example_row("drill, layered, melody, songstarters, trap, C# minor, 130 bpm", None, "closest", 8),
        example_row("hip hop, soul, rnb, neo soul, songstarters, B minor, 140 bpm", None, "closest", 8),
        example_row(
            "music, mallets, bells, melody, dancehall, african, afropop & afrobeats",
            sung_melody, "Gb:min", 7, cfg_coef=4.5,
        ),
    ]

    microphone_input = gr.Audio(
        sources=["microphone"],
        waveform_options=gr.WaveformOptions(
            waveform_color="#01C6FF",
            waveform_progress_color="#0066B4",
            skip_length=2,
            show_controls=False,
        ),
        type="filepath",
    )

    # Order matters: inputs are passed positionally to ``Pipeline.__call__``.
    inputs = [
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
        microphone_input,
        gr.Dropdown(scale_choices, label="Scale for pitch correction.", value="closest"),
        gr.Checkbox(label="Is Continuation", value=False),
        gr.Slider(label="Batch Size", value=1, minimum=1, maximum=pipeline.max_batch_size, step=1),
        gr.Slider(label="Duration", value=15, minimum=4, maximum=30),
        gr.Checkbox(label="Use Sampling", value=True),
        gr.Slider(label="Temperature", value=1.0, minimum=0.0, maximum=2.0),
        gr.Slider(label="Top K", value=250, minimum=0, maximum=1000),
        gr.Slider(label="Top P", value=0.0, minimum=0.0, maximum=1.0),
        gr.Slider(label="CFG Coef", value=3.0, minimum=0.0, maximum=10.0),
        gr.Textbox(label="Output Dir", value="./samples"),
        gr.Dropdown(["loudness", "clip", "peak", "rms"], value="loudness", label="Strategy for normalizing audio."),
        gr.Slider(label="random seed", minimum=-1, maximum=9e8),
    ]

    # One player for the (pitch-corrected) input plus one per possible sample.
    outputs = [
        gr.Audio(label=("Input " if i == 0 else "") + f"Audio {i}")
        for i in range(pipeline.max_batch_size + 1)
    ]

    interface = gr.Interface(
        fn=pipeline.__call__,
        inputs=inputs,
        outputs=outputs,
        title="🎶 Generate song ideas with musicgen-songstarter-v0.2 🎶",
        description="Check out the repo [here](https://huggingface.co/nateraw/musicgen-songstarter-v0.2)",
        examples=examples,
    )
    interface.launch(share=share, debug=debug)
|
335 |
+
|
336 |
+
|
337 |
+
if __name__ == '__main__':
|
338 |
+
from fire import Fire
|
339 |
+
Fire(main)
|
340 |
+
|
341 |
+
# For testing
|
342 |
+
|
343 |
+
# pipe = Pipeline("nateraw/musicgen-songstarter-v0.2", max_batch_size=4)
|
344 |
+
# example_input = (
|
345 |
+
# "hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm",
|
346 |
+
# "nate_is_humming.wav",
|
347 |
+
# "closest",
|
348 |
+
# False,
|
349 |
+
# 1,
|
350 |
+
# 8,
|
351 |
+
# True,
|
352 |
+
# 1.0,
|
353 |
+
# 250,
|
354 |
+
# 0.0,
|
355 |
+
# 3.0,
|
356 |
+
# "./samples",
|
357 |
+
# "loudness",
|
358 |
+
# -1,
|
359 |
+
# 0,
|
360 |
+
# None
|
361 |
+
# )
|
362 |
+
# out = pipe(*example_input)
|
nate_is_humming.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a62520e3026bc71b06fa75a8120c3b46524a0a34dcac9661e3e27632e294b11f
|
3 |
+
size 1196036
|
nate_is_singing_Gb_minor.wav
ADDED
Binary file (619 kB). View file
|
|
pitch_correction_utils.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
from pathlib import Path
|
3 |
+
import argparse
|
4 |
+
import librosa
|
5 |
+
import librosa.display
|
6 |
+
import numpy as np
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import soundfile as sf
|
9 |
+
import scipy.signal as sig
|
10 |
+
import psola
|
11 |
+
|
12 |
+
|
13 |
+
SEMITONES_IN_OCTAVE = 12
|
14 |
+
|
15 |
+
|
16 |
+
def degrees_from(scale: str):
    """Return the pitch classes (degrees) of ``scale`` plus the root one octave up.

    The extra entry (first degree + 12) makes nearest-degree rounding work for
    pitches that sit just below the root; without it they would be matched to
    the wrong degree.
    """
    scale_degrees = librosa.key_to_degrees(scale)
    root_octave_up = scale_degrees[0] + SEMITONES_IN_OCTAVE
    return np.concatenate((scale_degrees, [root_octave_up]))
|
24 |
+
|
25 |
+
|
26 |
+
def closest_pitch(f0):
    """Snap each frequency in ``f0`` (Hz) to the nearest MIDI note, returned in Hz.

    NaN entries of ``f0`` remain NaN in the result.
    """
    # Note where the input is NaN so those positions can be restored after rounding.
    unvoiced = np.isnan(f0)
    rounded_midi = np.around(librosa.hz_to_midi(f0))
    rounded_midi[unvoiced] = np.nan
    return librosa.midi_to_hz(rounded_midi)
|
34 |
+
|
35 |
+
|
36 |
+
def closest_pitch_from_scale(f0, scale):
    """Return the frequency (Hz) nearest to ``f0`` whose pitch class is in ``scale``.

    A NaN ``f0`` is returned as NaN.
    """
    if np.isnan(f0):
        return np.nan

    allowed_degrees = degrees_from(scale)
    midi_note = librosa.hz_to_midi(f0)

    # Real-valued pitch class: the MIDI note with whole octaves removed.
    pitch_class = midi_note % SEMITONES_IN_OCTAVE

    # Nearest allowed degree, and how far the input sits from it.
    nearest_degree = allowed_degrees[np.argmin(np.abs(allowed_degrees - pitch_class))]
    offset = pitch_class - nearest_degree

    # Move the note by that offset and convert back to Hz.
    return librosa.midi_to_hz(midi_note - offset)
|
54 |
+
|
55 |
+
|
56 |
+
def aclosest_pitch_from_scale(f0, scale):
    """Array version of ``closest_pitch_from_scale`` with median smoothing.

    Every element of ``f0`` is snapped to ``scale``, then an 11-point median
    filter smooths the result. Positions the filter turned into NaN are
    refilled from the unsmoothed values.
    """
    snapped = np.zeros_like(f0)
    for idx, pitch in enumerate(f0):
        snapped[idx] = closest_pitch_from_scale(pitch, scale)

    smoothed = sig.medfilt(snapped, kernel_size=11)

    # Fall back to the unfiltered value wherever filtering produced NaN.
    nan_after_filter = np.isnan(smoothed)
    smoothed[nan_after_filter] = snapped[nan_after_filter]
    return smoothed
|
67 |
+
|
68 |
+
|
69 |
+
def autotune(audio, sr, correction_function, plot=False):
    """Pitch-correct ``audio`` by tracking its pitch and re-synthesising it.

    The pitch contour is estimated with PYIN, mapped through
    ``correction_function`` and applied with PSOLA.

    Args:
        audio: Audio samples (presumably a 1-D mono array; callers in this
            project reduce to one channel first).
        sr: Sample rate of ``audio``.
        correction_function: Callable taking the tracked ``f0`` array and
            returning the target pitch array.
        plot: When true, save a spectrogram with the original and corrected
            pitch overlaid to ``pitch_correction.png`` in the working directory.

    Returns:
        The result of ``psola.vocode`` for the corrected pitch contour.
    """
    # Set some basic parameters.
    frame_length = 2048
    hop_length = frame_length // 4
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C7')

    # Pitch tracking using the PYIN algorithm; only the f0 contour is needed.
    f0, _voiced_flag, _voiced_probabilities = librosa.pyin(audio,
                                                           frame_length=frame_length,
                                                           hop_length=hop_length,
                                                           sr=sr,
                                                           fmin=fmin,
                                                           fmax=fmax)

    # Apply the chosen adjustment strategy to the pitch.
    corrected_f0 = correction_function(f0)

    if plot:
        # Plot the spectrogram, overlaid with the original pitch trajectory and the adjusted
        # pitch trajectory.
        stft = librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)
        time_points = librosa.times_like(stft, sr=sr, hop_length=hop_length)
        log_stft = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
        fig, ax = plt.subplots()
        try:
            img = librosa.display.specshow(log_stft, x_axis='time', y_axis='log', ax=ax, sr=sr, hop_length=hop_length, fmin=fmin, fmax=fmax)
            fig.colorbar(img, ax=ax, format="%+2.f dB")
            ax.plot(time_points, f0, label='original pitch', color='cyan', linewidth=2)
            ax.plot(time_points, corrected_f0, label='corrected pitch', color='orange', linewidth=1)
            ax.legend(loc='upper right')
            ax.set_ylabel('Frequency [Hz]')
            ax.set_xlabel('Time [M:SS]')
            fig.savefig('pitch_correction.png', dpi=300, bbox_inches='tight')
        finally:
            # Release the figure; pyplot otherwise keeps it alive across calls.
            plt.close(fig)

    # Pitch-shifting using the PSOLA algorithm.
    return psola.vocode(audio, sample_rate=int(sr), target_pitch=corrected_f0, fmin=fmin, fmax=fmax)
|
105 |
+
|
106 |
+
|
107 |
+
def main(
    vocals_file,
    plot=False,
    correction_method="closest",
    scale=None
):
    """Run autotune-like pitch correction on the given audio file.

    The corrected audio is written next to the input as
    ``<stem>_pitch_corrected<suffix>`` and also returned.

    Args:
        vocals_file (str): Filepath to the audio file to be pitch-corrected.
        plot (bool, optional): Whether to plot the results. Defaults to False.
        correction_method (str, optional): The pitch correction method to use. Defaults to `"closest"`.
            If set to "closest", the pitch will be rounded to the nearest MIDI note.
            Any other value (e.g. "scale") rounds the pitch to the nearest note in the given `scale`.
        scale (str, optional): The scale to use for pitch correction. ex. `"C:min"` / `"A:maj"`. Defaults to None.

    Returns:
        The pitch-corrected audio returned by ``autotune``.
    """
    source_path = Path(vocals_file)

    # Load at the file's native sample rate without down-mixing.
    y, sr = librosa.load(str(source_path), sr=None, mono=False)

    # Only mono is handled: for multi-channel files use the first channel.
    if y.ndim > 1:
        y = y[0, :]

    # Select the pitch adjustment strategy.
    if correction_method == 'closest':
        correction_function = closest_pitch
    else:
        correction_function = partial(aclosest_pitch_from_scale, scale=scale)

    pitch_corrected_y = autotune(y, sr, correction_function, plot)

    # Save beside the input, tagging the file name with "_pitch_corrected".
    output_name = source_path.stem + '_pitch_corrected' + source_path.suffix
    output_path = source_path.parent / output_name
    sf.write(str(output_path), pitch_corrected_y, sr)
    return pitch_corrected_y
|
154 |
+
|
155 |
+
|
156 |
+
if __name__=='__main__':
|
157 |
+
# main("./singing_music_idea.wav --plot -c closest".split())
|
158 |
+
# python pitch_correction_utils.py --vocals_file "./nate_is_humming.wav" --plot -c closest
|
159 |
+
from fire import Fire
|
160 |
+
Fire(main)
|
161 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft
|
2 |
+
hf_transfer
|
3 |
+
gradio
|
4 |
+
psola
|
5 |
+
torchvision==0.16.0
|
6 |
+
fire
|
singing_songstarter_demo.ipynb
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"nbformat": 4,
|
3 |
+
"nbformat_minor": 0,
|
4 |
+
"metadata": {
|
5 |
+
"colab": {
|
6 |
+
"provenance": [],
|
7 |
+
"machine_shape": "hm",
|
8 |
+
"gpuType": "A100",
|
9 |
+
"authorship_tag": "ABX9TyMm+2HEY3Dh8UBT+NJ/CIoa",
|
10 |
+
"include_colab_link": true
|
11 |
+
},
|
12 |
+
"kernelspec": {
|
13 |
+
"name": "python3",
|
14 |
+
"display_name": "Python 3"
|
15 |
+
},
|
16 |
+
"language_info": {
|
17 |
+
"name": "python"
|
18 |
+
},
|
19 |
+
"accelerator": "GPU"
|
20 |
+
},
|
21 |
+
"cells": [
|
22 |
+
{
|
23 |
+
"cell_type": "markdown",
|
24 |
+
"metadata": {
|
25 |
+
"id": "view-in-github",
|
26 |
+
"colab_type": "text"
|
27 |
+
},
|
28 |
+
"source": [
|
29 |
+
"<a href=\"https://colab.research.google.com/github/nateraw/singing-songstarter/blob/main/singing_songstarter_demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "markdown",
|
34 |
+
"source": [
|
35 |
+
"# Singing Songstarter Demo\n",
|
36 |
+
"\n",
|
37 |
+
"This is a demo of using [`musicgen-songstarter-v0.2`](https://hf.co/nateraw/musicgen-songstarter-v0.2), a large stereo musicgen trained to be useful for music producers, for the task of voice-to-music.\n",
|
38 |
+
"\n",
|
39 |
+
"**Hum an idea, get a music sample!** 🚀\n",
|
40 |
+
"\n",
|
41 |
+
"### Usage\n",
|
42 |
+
"\n",
|
43 |
+
"1. Run the cell below.\n",
|
44 |
+
"\n",
|
45 |
+
"2. You can ignore the \"restart this runtime\" message when it pops up\n",
|
46 |
+
"3. Click the public share link. Should look like: `\"Running on public URL: https://<your-link-here>\"`\n",
|
47 |
+
"4. Enjoy 🔥\n",
|
48 |
+
"\n",
|
49 |
+
"\n",
|
50 |
+
"### If you think this notebook is cool, consider supporting me by:\n",
|
51 |
+
" - giving [the model](https://hf.co/nateraw/musicgen-songstarter-v0.2) a heart on Hugging Face ❤️\n",
|
52 |
+
" - following me on [GitHub](https://github.com/nateraw) 👨💻\n",
|
53 |
+
" - following me on [X/twitter](https://twitter.com/nateraw) X\n",
|
54 |
+
" - giving [the demo repo](https://github.com/nateraw/singing-songstarter) a star ⭐️\n",
|
55 |
+
"\n",
|
56 |
+
"If you have any questions/concerns about this demo, please [file an issue on GitHub](https://github.com/nateraw/singing-songstarter)."
|
57 |
+
],
|
58 |
+
"metadata": {
|
59 |
+
"id": "hBsE8AuVsgG8"
|
60 |
+
}
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "code",
|
64 |
+
"execution_count": null,
|
65 |
+
"metadata": {
|
66 |
+
"id": "-fw0bpXysAUG"
|
67 |
+
},
|
68 |
+
"outputs": [],
|
69 |
+
"source": [
|
70 |
+
"%cd /content\n",
|
71 |
+
"! git clone https://github.com/nateraw/singing-songstarter\n",
|
72 |
+
"%cd /content/singing-songstarter\n",
|
73 |
+
"! pip install -r requirements.txt\n",
|
74 |
+
"! python app.py --share --debug"
|
75 |
+
]
|
76 |
+
}
|
77 |
+
]
|
78 |
+
}
|