Spaces:

GroveStreet
/

GTA_SOVITS

Running

App Files Files Community

Katock committed on Jul 28, 2023

Commit

13b8440

1 Parent(s): ee4a2a4

Update utils.py

Browse files

Files changed (1) hide show

utils.py +108 -55

utils.py CHANGED Viewed

@@ -66,6 +66,84 @@ def plot_data_to_numpy(x, y):
     plt.close()
     return data
 def f0_to_coarse(f0):
   is_torch = isinstance(f0, torch.Tensor)
@@ -78,6 +156,35 @@ def f0_to_coarse(f0):
   assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
   return f0_coarse
 def get_content(cmodel, y):
     with torch.no_grad():
         c = cmodel.extract_features(y.squeeze(1))[0]
@@ -174,7 +281,6 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
         checkpoint_path, iteration))
     return model, optimizer, learning_rate, iteration
 def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
   logger.info("Saving model and optimizer state at iteration {} to {}".format(
     iteration, checkpoint_path))
@@ -292,47 +398,6 @@ def load_filepaths_and_text(filename, split="|"):
   return filepaths_and_text
-def get_hparams(init=True):
-  parser = argparse.ArgumentParser()
-  parser.add_argument('-c', '--config', type=str, default="./configs/config.json",
-                      help='JSON file for configuration')
-  parser.add_argument('-m', '--model', type=str, required=True,
-                      help='Model name')
-  args = parser.parse_args()
-  model_dir = os.path.join("./logs", args.model)
-  if not os.path.exists(model_dir):
-    os.makedirs(model_dir)
-  config_path = args.config
-  config_save_path = os.path.join(model_dir, "config.json")
-  if init:
-    with open(config_path, "r") as f:
-      data = f.read()
-    with open(config_save_path, "w") as f:
-      f.write(data)
-  else:
-    with open(config_save_path, "r") as f:
-      data = f.read()
-  config = json.loads(data)
-  hparams = HParams(**config)
-  hparams.model_dir = model_dir
-  return hparams
-def get_hparams_from_dir(model_dir):
-  config_save_path = os.path.join(model_dir, "config.json")
-  with open(config_save_path, "r") as f:
-    data = f.read()
-  config = json.loads(data)
-  hparams =HParams(**config)
-  hparams.model_dir = model_dir
-  return hparams
 def get_hparams_from_file(config_path):
   with open(config_path, "r") as f:
     data = f.read()
@@ -393,19 +458,7 @@ def repeat_expand_2d(content, target_len):
     return target
-def mix_model(model_paths,mix_rate,mode):
-  mix_rate = torch.FloatTensor(mix_rate)/100
-  model_tem = torch.load(model_paths[0])
-  models = [torch.load(path)["model"] for path in model_paths]
-  if mode == 0:
-     mix_rate = F.softmax(mix_rate,dim=0)
-  for k in model_tem["model"].keys():
-     model_tem["model"][k] = torch.zeros_like(model_tem["model"][k])
-     for i,model in enumerate(models):
-        model_tem["model"][k] += model[k]*mix_rate[i]
-  torch.save(model_tem,os.path.join(os.path.curdir,"output.pth"))
-  return os.path.join(os.path.curdir,"output.pth")
 def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频，2是输出音频,rate是2的占比 from RVC
     # print(data1.max(),data2.max())
     rms1 = librosa.feature.rms(

     plt.close()
     return data
def interpolate_f0(f0):
    """Fill unvoiced F0 frames by interpolation and report which were voiced.

    A frame is "unvoiced" when its value is ``<= 0``. Each run of unvoiced
    frames is filled as follows:

    * leading run (no voiced frame before it): copy the next voiced value;
    * inner run: ramp from the previous voiced value towards the next one;
    * trailing run (no voiced frame after it): hold the previous voiced
      value, or stay at 0 when the whole input is unvoiced.

    The input array is never modified; a filled copy is returned.

    Args:
        f0: array-like of F0 values; it is flattened to one dimension.

    Returns:
        A tuple ``(ip_data, vuv_vector)`` of 1-D arrays with ``f0.size``
        entries each. ``ip_data`` holds the filled contour (same float dtype
        as the input, float64 for non-float input). ``vuv_vector`` is float32
        with 1.0 where the original frame was ``> 0`` and 0.0 elsewhere.
    """
    src = np.asarray(f0)
    # Keep the caller's float precision; promote anything else to float64 so
    # interpolated values are not truncated.
    dtype = src.dtype if np.issubdtype(src.dtype, np.floating) else np.float64
    # astype() copies, so the caller's array is left untouched.
    ip_data = src.astype(dtype).reshape(-1)

    vuv_vector = (ip_data > 0.0).astype(np.float32)
    # Snapshot of the *original* unvoiced frames; filling ip_data below does
    # not change which frames belong to a run.
    unvoiced = ip_data <= 0.0
    frame_number = ip_data.size

    i = 0
    while i < frame_number:
        if not unvoiced[i]:
            i += 1
            continue

        # [i, j) is a maximal run of unvoiced frames; j == frame_number means
        # no voiced frame follows the run.
        j = i + 1
        while j < frame_number and unvoiced[j]:
            j += 1

        last_value = ip_data[i - 1] if i > 0 else 0.0
        if j == frame_number:
            ip_data[i:] = last_value
        elif last_value > 0.0:
            # The step deliberately divides by (j - i), not (j - i + 1): the
            # ramp reaches ip_data[j] on the last filled frame. This matches
            # the contour the original implementation produced.
            step = (ip_data[j] - last_value) / float(j - i)
            ip_data[i:j] = last_value + step * np.arange(1, j - i + 1)
        else:
            ip_data[i:j] = ip_data[j]

        # Jump past the run so every frame is visited once (linear time).
        i = j

    return ip_data, vuv_vector
def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Extract an F0 contour with Praat's autocorrelation pitch tracker.

    Args:
        wav_numpy: waveform array; only ``shape[0]`` is read here before it
            is handed to ``parselmouth.Sound``.
        p_len: expected number of output frames. Defaults to
            ``wav_numpy.shape[0] // hop_length``. When given, it must be
            within 3 frames of that default.
        sampling_rate: sampling rate passed to ``parselmouth.Sound``.
        hop_length: hop size in samples; sets the analysis time step.

    Returns:
        The ``'frequency'`` track of the pitch analysis, zero-padded on both
        sides up to ``p_len`` frames when the tracker returned fewer. A track
        longer than ``p_len`` is returned as is.

    Raises:
        AssertionError: if ``p_len`` differs from the default by 4 or more.
    """
    import parselmouth

    hop_count = wav_numpy.shape[0] // hop_length
    if p_len is None:
        p_len = hop_count
    else:
        assert abs(p_len - hop_count) < 4, "pad length error"

    # Hop duration in milliseconds; converted back to seconds for the call.
    time_step_ms = hop_length / sampling_rate * 1000
    sound = parselmouth.Sound(wav_numpy, sampling_rate)
    pitch = sound.to_pitch_ac(
        time_step=time_step_ms / 1000,
        voicing_threshold=0.6,
        pitch_floor=50,
        pitch_ceiling=1100,
    )
    f0 = pitch.selected_array['frequency']

    # Split the missing frames between both ends; the left side gets the
    # extra frame when the count is odd.
    missing = p_len - len(f0)
    pad_left = (missing + 1) // 2
    pad_right = missing - pad_left
    if pad_left > 0 or pad_right > 0:
        f0 = np.pad(f0, [[pad_left, pad_right]], mode='constant')
    return f0
def resize_f0(x, target_len):
    """Resample an F0 contour to ``target_len`` frames by linear interpolation.

    Values below 0.001 are treated as unvoiced: they are turned into NaN
    before interpolation so they do not drag neighbouring pitch values
    towards zero, and every NaN in the result is mapped back to 0.

    Args:
        x: 1-D array-like of F0 values. Any numeric dtype is accepted; it is
            copied into a float64 array, so the input is not modified.
        target_len: number of frames in the returned contour.

    Returns:
        A float64 array of length ``target_len``. For empty input an all-zero
        array is returned.
    """
    # Force a float copy: an integer array cannot hold the NaN marker below.
    source = np.array(x, dtype=np.float64)
    if source.size == 0:
        # Nothing to interpolate from; report every frame as unvoiced.
        return np.zeros(target_len, dtype=np.float64)

    source[source < 0.001] = np.nan
    n_frames = len(source)
    # target_len evenly spaced sample positions over [0, n_frames).
    positions = np.arange(0, n_frames * target_len, n_frames) / target_len
    target = np.interp(positions, np.arange(0, n_frames), source)
    return np.nan_to_num(target)
def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Extract an F0 contour with pyworld's DIO estimator and StoneMask refinement.

    Args:
        wav_numpy: waveform array; it is converted to double precision
            before being passed to pyworld.
        p_len: number of frames in the returned contour. Defaults to
            ``wav_numpy.shape[0] // hop_length``.
        sampling_rate: sampling rate passed to pyworld.
        hop_length: hop size in samples; sets the analysis frame period.

    Returns:
        The refined F0 track, rounded to one decimal place and resampled to
        ``p_len`` frames by ``resize_f0``.
    """
    import pyworld

    if p_len is None:
        p_len = wav_numpy.shape[0] // hop_length

    signal = wav_numpy.astype(np.double)
    # frame_period is the hop duration expressed in milliseconds.
    coarse_f0, frame_times = pyworld.dio(
        signal,
        fs=sampling_rate,
        f0_ceil=800,
        frame_period=1000 * hop_length / sampling_rate,
    )
    refined_f0 = pyworld.stonemask(signal, coarse_f0, frame_times, sampling_rate)
    refined_f0 = np.round(refined_f0, 1)
    return resize_f0(refined_f0, p_len)
 def f0_to_coarse(f0):
   is_torch = isinstance(f0, torch.Tensor)
   assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
   return f0_coarse
def get_hubert_model():
  """Load the HuBERT checkpoint with fairseq and return it in eval mode.

  The checkpoint path is fixed to ``hubert/checkpoint_best_legacy_500.pt``
  and is printed before loading.

  Returns:
      The first model of the ensemble loaded by fairseq, after ``eval()``
      has been called on it.
  """
  checkpoint = "hubert/checkpoint_best_legacy_500.pt"
  print("load model(s) from {}".format(checkpoint))

  # fairseq is imported at call time, after the message above is printed.
  from fairseq import checkpoint_utils

  # Only the model list is used; the saved config and task are discarded.
  ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
    [checkpoint],
    suffix="",
  )
  hubert = ensemble[0]
  hubert.eval()
  return hubert
def get_hubert_content(hmodel, wav_16k_tensor):
  """Run a waveform through the HuBERT model and return projected features.

  Args:
      hmodel: model exposing ``extract_features(source, padding_mask,
          output_layer)`` and ``final_proj``.
      wav_16k_tensor: 1-D waveform tensor, or a 2-D tensor that is averaged
          over its last dimension first (presumably samples x channels;
          verify against the caller).

  Returns:
      ``final_proj`` applied to the first element returned by
      ``extract_features``, with dimensions 1 and 2 swapped.

  Raises:
      AssertionError: if the waveform is not 1-D after the optional
          channel averaging.
  """
  wav = wav_16k_tensor
  if wav.dim() == 2:  # two channels: average them into one
    wav = wav.mean(-1)
  assert wav.dim() == 1, wav.dim()

  device = wav_16k_tensor.device
  batch = wav.view(1, -1)  # add a leading batch dimension of size 1
  # All-False mask: no position of the single sequence is padding.
  no_padding = torch.zeros(batch.shape, dtype=torch.bool)
  source = batch.to(device)
  padding_mask = no_padding.to(device)

  with torch.no_grad():
    hidden = hmodel.extract_features(
      source=source,
      padding_mask=padding_mask,
      output_layer=9,  # layer 9
    )
    projected = hmodel.final_proj(hidden[0])
  return projected.transpose(1, 2)
 def get_content(cmodel, y):
     with torch.no_grad():
         c = cmodel.extract_features(y.squeeze(1))[0]
         checkpoint_path, iteration))
     return model, optimizer, learning_rate, iteration
 def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
   logger.info("Saving model and optimizer state at iteration {} to {}".format(
     iteration, checkpoint_path))
   return filepaths_and_text
 def get_hparams_from_file(config_path):
   with open(config_path, "r") as f:
     data = f.read()
     return target
 def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频，2是输出音频,rate是2的占比 from RVC
     # print(data1.max(),data2.max())
     rms1 = librosa.feature.rms(