Spaces:
Running
Running
Update src/infer_pack/predictor/RMVPE.py
Browse files
src/infer_pack/predictor/RMVPE.py
CHANGED
@@ -334,28 +334,23 @@ class RMVPE:
|
|
334 |
ckpt = torch.load(model_path, map_location="cpu")
|
335 |
model.load_state_dict(ckpt)
|
336 |
model.eval()
|
337 |
-
if is_half
|
338 |
model = model.half()
|
339 |
self.model = model
|
340 |
-
self.resample_kernel = {}
|
341 |
self.is_half = is_half
|
342 |
-
if device
|
343 |
-
|
344 |
-
self.
|
345 |
-
self.mel_extractor = MelSpectrogram(
|
346 |
-
is_half, 128, 16000, 1024, 160, None, 30, 8000
|
347 |
-
).to(device)
|
348 |
-
self.model = self.model.to(device)
|
349 |
cents_mapping = 20 * np.arange(360) + 1997.3794084376191
|
350 |
-
self.cents_mapping = np.pad(cents_mapping, (4, 4))
|
351 |
|
352 |
def mel2hidden(self, mel):
|
353 |
with torch.no_grad():
|
354 |
n_frames = mel.shape[-1]
|
355 |
-
mel = mel.float()
|
356 |
-
mel = F.pad(
|
357 |
-
|
358 |
-
|
359 |
hidden = self.model(mel)
|
360 |
return hidden[:, :n_frames]
|
361 |
|
@@ -370,7 +365,7 @@ class RMVPE:
|
|
370 |
mel = self.mel_extractor(audio, center=True)
|
371 |
hidden = self.mel2hidden(mel)
|
372 |
hidden = hidden.squeeze(0).cpu().numpy()
|
373 |
-
if self.is_half
|
374 |
hidden = hidden.astype("float32")
|
375 |
f0 = self.decode(hidden, thred=thred)
|
376 |
return f0
|
@@ -384,23 +379,23 @@ class RMVPE:
|
|
384 |
starts = center - 4
|
385 |
ends = center + 5
|
386 |
for idx in range(salience.shape[0]):
|
387 |
-
todo_salience.append(salience[:, starts[idx]
|
388 |
-
todo_cents_mapping.append(self.cents_mapping[starts[idx]
|
389 |
todo_salience = np.array(todo_salience)
|
390 |
todo_cents_mapping = np.array(todo_cents_mapping)
|
391 |
product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
|
392 |
weight_sum = np.sum(todo_salience, 1)
|
393 |
-
|
394 |
maxx = np.max(salience, axis=1)
|
395 |
-
|
396 |
-
return
|
397 |
|
398 |
def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
|
399 |
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
|
400 |
mel = self.mel_extractor(audio, center=True)
|
401 |
hidden = self.mel2hidden(mel)
|
402 |
hidden = hidden.squeeze(0).cpu().numpy()
|
403 |
-
if self.is_half
|
404 |
hidden = hidden.astype("float32")
|
405 |
f0 = self.decode(hidden, thred=thred)
|
406 |
f0[(f0 < f0_min) | (f0 > f0_max)] = 0
|
|
|
334 |
ckpt = torch.load(model_path, map_location="cpu")
|
335 |
model.load_state_dict(ckpt)
|
336 |
model.eval()
|
337 |
+
if is_half:
|
338 |
model = model.half()
|
339 |
self.model = model
|
|
|
340 |
self.is_half = is_half
|
341 |
+
self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
|
342 |
+
self.mel_extractor = MelSpectrogram(is_half, 128, 16000, 1024, 160, None, 30, 8000).to(self.device)
|
343 |
+
self.model = self.model.to(self.device)
|
|
|
|
|
|
|
|
|
344 |
cents_mapping = 20 * np.arange(360) + 1997.3794084376191
|
345 |
+
self.cents_mapping = np.pad(cents_mapping, (4, 4))
|
346 |
|
347 |
def mel2hidden(self, mel):
    """Run ``self.model`` on a mel spectrogram and trim the result.

    The last dimension of ``mel`` is right-padded up to the next multiple
    of 32 before the model is called; the model output is then cut back to
    the original number of frames along dimension 1.

    Args:
        mel: Tensor whose last dimension is the frame axis. Presumably
            shaped (batch, n_mels, n_frames) -- TODO confirm against the
            mel extractor that produces it.

    Returns:
        ``self.model(mel)[:, :n_frames]``, where ``n_frames`` is the size
        of the last dimension of the input.
    """
    with torch.no_grad():
        n_frames = mel.shape[-1]
        # Padding is done in float32; the cast to half happens afterwards.
        mel = mel.float()
        pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
        if pad > 0:
            # Reflect padding is only valid while the pad width is smaller
            # than the dimension being padded, so inputs of 16 frames or
            # fewer would raise. Fall back to zero padding for those; the
            # padded frames are sliced off the output below either way.
            mode = "reflect" if pad < n_frames else "constant"
            mel = F.pad(mel, (0, pad), mode=mode)
        if self.is_half:
            mel = mel.half()
        hidden = self.model(mel)
        return hidden[:, :n_frames]
|
356 |
|
|
|
365 |
mel = self.mel_extractor(audio, center=True)
|
366 |
hidden = self.mel2hidden(mel)
|
367 |
hidden = hidden.squeeze(0).cpu().numpy()
|
368 |
+
if self.is_half:
|
369 |
hidden = hidden.astype("float32")
|
370 |
f0 = self.decode(hidden, thred=thred)
|
371 |
return f0
|
|
|
379 |
starts = center - 4
|
380 |
ends = center + 5
|
381 |
for idx in range(salience.shape[0]):
|
382 |
+
todo_salience.append(salience[:, starts[idx]:ends[idx]][idx])
|
383 |
+
todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
|
384 |
todo_salience = np.array(todo_salience)
|
385 |
todo_cents_mapping = np.array(todo_cents_mapping)
|
386 |
product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
|
387 |
weight_sum = np.sum(todo_salience, 1)
|
388 |
+
divided = product_sum / weight_sum
|
389 |
maxx = np.max(salience, axis=1)
|
390 |
+
divided[maxx <= thred] = 0
|
391 |
+
return divided
|
392 |
|
393 |
def infer_from_audio_with_pitch(self, audio, thred=0.03, f0_min=50, f0_max=1100):
|
394 |
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
|
395 |
mel = self.mel_extractor(audio, center=True)
|
396 |
hidden = self.mel2hidden(mel)
|
397 |
hidden = hidden.squeeze(0).cpu().numpy()
|
398 |
+
if self.is_half:
|
399 |
hidden = hidden.astype("float32")
|
400 |
f0 = self.decode(hidden, thred=thred)
|
401 |
f0[(f0 < f0_min) | (f0 > f0_max)] = 0
|