Spaces:
Running
Running
Katock
committed on
Commit
·
13b8440
1
Parent(s):
ee4a2a4
Update utils.py
Browse files
utils.py
CHANGED
@@ -66,6 +66,84 @@ def plot_data_to_numpy(x, y):
|
|
66 |
plt.close()
|
67 |
return data
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
def f0_to_coarse(f0):
|
71 |
is_torch = isinstance(f0, torch.Tensor)
|
@@ -78,6 +156,35 @@ def f0_to_coarse(f0):
|
|
78 |
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
|
79 |
return f0_coarse
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
def get_content(cmodel, y):
|
82 |
with torch.no_grad():
|
83 |
c = cmodel.extract_features(y.squeeze(1))[0]
|
@@ -174,7 +281,6 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False
|
|
174 |
checkpoint_path, iteration))
|
175 |
return model, optimizer, learning_rate, iteration
|
176 |
|
177 |
-
|
178 |
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
|
179 |
logger.info("Saving model and optimizer state at iteration {} to {}".format(
|
180 |
iteration, checkpoint_path))
|
@@ -292,47 +398,6 @@ def load_filepaths_and_text(filename, split="|"):
|
|
292 |
return filepaths_and_text
|
293 |
|
294 |
|
295 |
-
def get_hparams(init=True):
    """Build an ``HParams`` object from command-line arguments.

    Parses ``-c/--config`` (path to a JSON config, default
    ``./configs/config.json``) and the required ``-m/--model`` name from the
    process arguments. The model directory is ``./logs/<model>`` and is
    created when it does not exist.

    Args:
        init: When true, the config file given on the command line is read
            and copied verbatim to ``<model_dir>/config.json``. When false,
            the previously saved ``<model_dir>/config.json`` is read instead.

    Returns:
        ``HParams`` built from the parsed JSON, with ``model_dir`` set.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-c', '--config', type=str, default="./configs/config.json",
                     help='JSON file for configuration')
    cli.add_argument('-m', '--model', type=str, required=True,
                     help='Model name')
    args = cli.parse_args()

    model_dir = os.path.join("./logs", args.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    saved_config = os.path.join(model_dir, "config.json")
    if init:
        # Fresh run: snapshot the user-supplied config next to the model.
        with open(args.config, "r") as src:
            raw = src.read()
        with open(saved_config, "w") as dst:
            dst.write(raw)
    else:
        # Resumed run: trust the snapshot taken earlier.
        with open(saved_config, "r") as src:
            raw = src.read()

    hparams = HParams(**json.loads(raw))
    hparams.model_dir = model_dir
    return hparams
|
323 |
-
|
324 |
-
|
325 |
-
def get_hparams_from_dir(model_dir):
    """Load ``<model_dir>/config.json`` into an ``HParams`` object.

    Args:
        model_dir: Directory that contains a ``config.json`` file.

    Returns:
        ``HParams`` built from the parsed JSON, with ``model_dir`` set to the
        directory that was passed in.
    """
    with open(os.path.join(model_dir, "config.json"), "r") as handle:
        raw = handle.read()

    hparams = HParams(**json.loads(raw))
    hparams.model_dir = model_dir
    return hparams
|
334 |
-
|
335 |
-
|
336 |
def get_hparams_from_file(config_path):
|
337 |
with open(config_path, "r") as f:
|
338 |
data = f.read()
|
@@ -393,19 +458,7 @@ def repeat_expand_2d(content, target_len):
|
|
393 |
return target
|
394 |
|
395 |
|
396 |
-
|
397 |
-
mix_rate = torch.FloatTensor(mix_rate)/100
|
398 |
-
model_tem = torch.load(model_paths[0])
|
399 |
-
models = [torch.load(path)["model"] for path in model_paths]
|
400 |
-
if mode == 0:
|
401 |
-
mix_rate = F.softmax(mix_rate,dim=0)
|
402 |
-
for k in model_tem["model"].keys():
|
403 |
-
model_tem["model"][k] = torch.zeros_like(model_tem["model"][k])
|
404 |
-
for i,model in enumerate(models):
|
405 |
-
model_tem["model"][k] += model[k]*mix_rate[i]
|
406 |
-
torch.save(model_tem,os.path.join(os.path.curdir,"output.pth"))
|
407 |
-
return os.path.join(os.path.curdir,"output.pth")
|
408 |
-
|
409 |
def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 from RVC
|
410 |
# print(data1.max(),data2.max())
|
411 |
rms1 = librosa.feature.rms(
|
|
|
66 |
plt.close()
|
67 |
return data
|
68 |
|
69 |
+
def interpolate_f0(f0):
    """Fill unvoiced F0 frames by interpolation and return a voicing mask.

    A frame counts as unvoiced when its value is ``<= 0``. Each run of
    unvoiced frames is filled as follows:

    * run between two voiced frames: a linear ramp starting from the voiced
      frame before the run;
    * run at the very start (no voiced frame seen yet): the value of the
      first voiced frame after the run;
    * run reaching the end of the signal: the last voiced value seen
      (``0.0`` if there was none).

    The input array is never modified.

    Args:
        f0: NumPy array of F0 values; it is flattened to one value per frame.

    Returns:
        A tuple ``(ip_data, vuv)`` of two 1-D arrays with ``f0.size`` entries:
        the interpolated F0 (same dtype as ``f0``) and a float32 mask holding
        ``1.0`` for frames that were voiced in the input and ``0.0`` otherwise.
    """
    # np.reshape hands back a view whenever it can; copy so that filling the
    # gaps below does not overwrite the caller's array.
    data = np.reshape(f0, (f0.size, 1)).copy()

    # The mask is taken from the untouched values, before any gap is filled.
    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
    vuv_vector[data > 0.0] = 1.0

    frame_number = data.size
    last_value = 0.0
    i = 0
    while i < frame_number:
        if data[i] <= 0.0:
            # Find j, the first voiced frame after the run that starts at i.
            j = i + 1
            while j < frame_number and not (data[j] > 0.0):
                j += 1

            if j < frame_number:
                # A voiced frame closes the run (it may be the final frame).
                if last_value > 0.0:
                    # NOTE: the step divides by the run length (j - i), so
                    # the ramp already reaches data[j] on frame j - 1.
                    step = (data[j] - data[i - 1]) / float(j - i)
                    for k in range(i, j):
                        data[k] = data[i - 1] + step * (k - i + 1)
                else:
                    # Leading run: nothing to ramp from, back-fill instead.
                    data[i:j] = data[j]
            else:
                # Trailing run: hold the last voiced value to the end.
                data[i:] = last_value
            i = j
        else:
            last_value = data[i]
            i += 1

    return data[:, 0], vuv_vector[:, 0]
|
106 |
+
|
107 |
+
def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Estimate F0 with Praat (via ``parselmouth``) and pad it to ``p_len``.

    Args:
        wav_numpy: Waveform samples; ``shape[0]`` is taken as the sample count.
        p_len: Wanted number of frames. Defaults to
            ``wav_numpy.shape[0] // hop_length``. An explicit value must be
            within 3 frames of that default, otherwise an ``AssertionError``
            ("pad length error") is raised.
        sampling_rate: Sample rate passed to ``parselmouth.Sound``.
        hop_length: Hop size in samples; sets the analysis time step.

    Returns:
        The ``'frequency'`` track of the autocorrelation pitch analysis,
        zero-padded at both ends when it is shorter than ``p_len``. A track
        longer than ``p_len`` is returned as is.
    """
    import parselmouth

    expected_frames = wav_numpy.shape[0] // hop_length
    if p_len is not None:
        assert abs(p_len - expected_frames) < 4, "pad length error"
    else:
        p_len = expected_frames

    # Kept as milliseconds and converted back, exactly like the original.
    time_step = hop_length / sampling_rate * 1000
    pitch = parselmouth.Sound(wav_numpy, sampling_rate).to_pitch_ac(
        time_step=time_step / 1000,
        voicing_threshold=0.6,
        pitch_floor=50,
        pitch_ceiling=1100,
    )
    f0 = pitch.selected_array['frequency']

    # Split the missing frames between front and back, front gets the extra.
    missing = p_len - len(f0)
    pad_front = (missing + 1) // 2
    pad_back = missing - pad_front
    if pad_front > 0 or pad_back > 0:
        f0 = np.pad(f0, [[pad_front, pad_back]], mode='constant')
    return f0
|
125 |
+
|
126 |
+
def resize_f0(x, target_len):
    """Resample an F0 track to ``target_len`` frames by linear interpolation.

    Values below ``0.001`` are treated as unvoiced: they are replaced by NaN
    before interpolating, and every NaN in the result is turned back into
    ``0`` afterwards.

    Args:
        x: Sequence or array of F0 values. Integer input is accepted; the
            work is done on a float64 copy, so ``x`` is left untouched.
        target_len: Number of frames in the output.

    Returns:
        A float64 NumPy array with ``target_len`` entries.
    """
    # Force a float copy: NaN cannot be stored in an integer array, so plain
    # np.array(x) made integer input fail on the next line.
    source = np.array(x, dtype=np.float64)
    source[source < 0.001] = np.nan

    n_frames = len(source)
    # Positions in source-frame units at which the output is sampled.
    query = np.arange(0, n_frames * target_len, n_frames) / target_len
    target = np.interp(query, np.arange(0, n_frames), source)
    return np.nan_to_num(target)
|
132 |
+
|
133 |
+
def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Estimate F0 with pyworld's DIO + StoneMask and resize it to ``p_len``.

    Args:
        wav_numpy: Waveform samples; converted to double for pyworld.
        p_len: Wanted number of frames. Defaults to
            ``wav_numpy.shape[0] // hop_length``.
        sampling_rate: Sample rate handed to pyworld.
        hop_length: Hop size in samples; sets the frame period.

    Returns:
        The refined F0 track, rounded to one decimal place and resampled to
        ``p_len`` frames with ``resize_f0``.
    """
    import pyworld

    if p_len is None:
        p_len = wav_numpy.shape[0] // hop_length

    frame_period_ms = 1000 * hop_length / sampling_rate
    coarse_f0, time_axis = pyworld.dio(
        wav_numpy.astype(np.double),
        fs=sampling_rate,
        f0_ceil=800,
        frame_period=frame_period_ms,
    )
    refined_f0 = pyworld.stonemask(
        wav_numpy.astype(np.double), coarse_f0, time_axis, sampling_rate)

    # Round frame by frame, in place, to one decimal.
    for idx in range(len(refined_f0)):
        refined_f0[idx] = round(refined_f0[idx], 1)

    return resize_f0(refined_f0, p_len)
|
147 |
|
148 |
def f0_to_coarse(f0):
|
149 |
is_torch = isinstance(f0, torch.Tensor)
|
|
|
156 |
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
|
157 |
return f0_coarse
|
158 |
|
159 |
+
def get_hubert_model():
    """Load the HuBERT checkpoint with fairseq and return it in eval mode.

    The checkpoint path is fixed to ``hubert/checkpoint_best_legacy_500.pt``
    (relative to the working directory). The path is printed before loading.

    Returns:
        The first model of the loaded ensemble, after ``eval()`` was called
        on it.
    """
    vec_path = "hubert/checkpoint_best_legacy_500.pt"
    print("load model(s) from {}".format(vec_path))

    # Imported lazily so the module can be used without fairseq installed.
    from fairseq import checkpoint_utils

    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
        [vec_path],
        suffix="",
    )
    hubert = ensemble[0]
    hubert.eval()
    return hubert
|
170 |
+
|
171 |
+
def get_hubert_content(hmodel, wav_16k_tensor):
    """Run a waveform through ``hmodel`` and return projected layer-9 features.

    Args:
        hmodel: Model exposing ``extract_features(source=, padding_mask=,
            output_layer=)`` and ``final_proj``.
        wav_16k_tensor: 1-D waveform tensor, or a 2-D tensor that is averaged
            over its last dimension first (presumably 16 kHz audio with
            channels last -- confirm against the caller).

    Returns:
        ``final_proj`` applied to the first element of the extracted
        features, with dimensions 1 and 2 swapped.

    Raises:
        AssertionError: If the waveform is not 1-D after the optional
            averaging step.
    """
    wav = wav_16k_tensor
    if wav.dim() == 2:  # double channels
        wav = wav.mean(-1)
    assert wav.dim() == 1, wav.dim()

    device = wav_16k_tensor.device
    batch = wav.view(1, -1)  # add a batch dimension of size 1
    # All-False mask: no position of the single item is padding.
    no_padding = torch.zeros(batch.shape, dtype=torch.bool)

    with torch.no_grad():
        logits = hmodel.extract_features(
            source=batch.to(device),
            padding_mask=no_padding.to(device),
            output_layer=9,  # layer 9
        )
        projected = hmodel.final_proj(logits[0])
    return projected.transpose(1, 2)
|
187 |
+
|
188 |
def get_content(cmodel, y):
|
189 |
with torch.no_grad():
|
190 |
c = cmodel.extract_features(y.squeeze(1))[0]
|
|
|
281 |
checkpoint_path, iteration))
|
282 |
return model, optimizer, learning_rate, iteration
|
283 |
|
|
|
284 |
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
|
285 |
logger.info("Saving model and optimizer state at iteration {} to {}".format(
|
286 |
iteration, checkpoint_path))
|
|
|
398 |
return filepaths_and_text
|
399 |
|
400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
def get_hparams_from_file(config_path):
|
402 |
with open(config_path, "r") as f:
|
403 |
data = f.read()
|
|
|
458 |
return target
|
459 |
|
460 |
|
461 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 from RVC
|
463 |
# print(data1.max(),data2.max())
|
464 |
rms1 = librosa.feature.rms(
|