Hev832 committed on
Commit
f64e5e8
·
verified ·
1 Parent(s): a6d0942

Create rvc.py

Browse files
Files changed (1) hide show
  1. rvc.py +147 -0
rvc.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from multiprocessing import cpu_count
2
+ from pathlib import Path
3
+
4
+ import torch
5
+ from fairseq import checkpoint_utils
6
+ from scipy.io import wavfile
7
+
8
+ from infer_pack.models import (
9
+ SynthesizerTrnMs256NSFsid,
10
+ SynthesizerTrnMs256NSFsid_nono,
11
+ SynthesizerTrnMs768NSFsid,
12
+ SynthesizerTrnMs768NSFsid_nono,
13
+ )
14
+ from my_utils import load_audio
15
+ from vc_infer_pipeline import VC
16
+
17
+ BASE_DIR = Path(__file__).resolve().parent  # absolute, resolved path of the directory that contains this file
18
+
19
class Config:
    """Device, precision and chunk-size settings used for RVC inference.

    The constructor stores the requested ``device`` and ``is_half`` and then
    calls :meth:`device_config`, which may override both depending on the
    hardware PyTorch reports.

    Attributes:
        device: Device string. Replaced by ``"mps"`` or ``"cpu"`` when CUDA
            is not available.
        is_half: Whether half precision should be used. Forced to ``False``
            for the GPU names matched in :meth:`device_config` and for the
            CPU fallback.
        n_cpu: CPU count taken from ``multiprocessing.cpu_count()``.
        gpu_name: CUDA device name, kept only when it matched the
            forced-single-precision list; otherwise ``None``.
        gpu_mem: Total CUDA device memory in GiB (``int`` of the value plus
            0.4), or ``None`` when CUDA is not available.
        x_pad, x_query, x_center, x_max: The four values returned by
            :meth:`device_config`.
    """

    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def device_config(self) -> tuple:
        """Detect the available hardware and choose the chunking parameters.

        Updates ``device``, ``is_half``, ``gpu_name``, ``gpu_mem`` and
        ``n_cpu`` on the instance as a side effect.

        Returns:
            The tuple ``(x_pad, x_query, x_center, x_max)``.

        Raises:
            ValueError: If CUDA is available and the text after the last
                ``":"`` in ``device`` is not an integer.
        """
        if torch.cuda.is_available():
            # The device string is expected to end in ":<index>", e.g. "cuda:0".
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            upper_name = self.gpu_name.upper()
            forced_single_precision = (
                ("16" in self.gpu_name and "V100" not in upper_name)
                or "P40" in upper_name
                or any(tag in self.gpu_name for tag in ("1060", "1070", "1080"))
            )
            if forced_single_precision:
                print("16 series/10 series P40 forced single precision")
                self.is_half = False
            else:
                self.gpu_name = None
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
        elif torch.backends.mps.is_available():
            print("No supported N-card found, use MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, use CPU for inference")
            self.device = "cpu"
            # Fix: the CPU fallback used to force half precision. Like the
            # GPUs handled above, the CPU must run in single precision, so
            # fall back to float32 and the matching chunk sizes below.
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # 6G memory config
            x_pad, x_query, x_center, x_max = 3, 10, 60, 65
        else:
            # 5G memory config
            x_pad, x_query, x_center, x_max = 1, 6, 38, 41

        # Small GPUs (4 GiB or less) get the smallest chunk sizes.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad, x_query, x_center, x_max = 1, 5, 30, 32

        return x_pad, x_query, x_center, x_max
81
+
82
def load_hubert(device, is_half, model_path):
    """Load the HuBERT checkpoint at ``model_path`` and prepare it for inference.

    Args:
        device: Target passed to the model's ``.to()``.
        is_half: Convert the model with ``.half()`` when true, ``.float()``
            otherwise.
        model_path: Checkpoint path handed to fairseq's
            ``load_model_ensemble_and_task``.

    Returns:
        The first model of the loaded ensemble, moved to ``device``,
        converted to the requested precision and switched to eval mode.
    """
    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
        [model_path], suffix=''
    )
    model = ensemble[0].to(device)
    model = model.half() if is_half else model.float()
    model.eval()
    return model
94
+
95
def get_vc(device, is_half, config, model_path):
    """Load an RVC voice-model checkpoint and build the objects used for inference.

    Args:
        device: Target passed to the synthesizer's ``.to()``.
        is_half: Convert the synthesizer with ``.half()`` when true,
            ``.float()`` otherwise. Also forwarded to the f0-enabled
            synthesizer constructors.
        config: Passed unchanged to ``VC`` together with the target sample
            rate.
        model_path: Path of the checkpoint read with ``torch.load``.

    Returns:
        The tuple ``(cpt, version, net_g, tgt_sr, vc)``: the loaded
        checkpoint dict, its version string, the synthesizer in eval mode,
        the last entry of ``cpt["config"]`` and the ``VC`` instance.

    Raises:
        ValueError: If the checkpoint lacks the ``"config"`` or ``"weight"``
            key, or if its ``"version"`` is neither ``"v1"`` nor ``"v2"``.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints from
    # sources you trust.
    cpt = torch.load(model_path, map_location='cpu')
    if "config" not in cpt or "weight" not in cpt:
        raise ValueError(f'Incorrect format for {model_path}. Use a voice model trained using RVC v2 instead.')

    tgt_sr = cpt["config"][-1]
    # Overwrite the third-from-last config entry with the first dimension of
    # the "emb_g.weight" tensor stored in the checkpoint.
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")

    # Fix: an unknown version used to leave net_g unbound and crash with an
    # UnboundLocalError further down; reject it explicitly instead.
    if version not in ("v1", "v2"):
        raise ValueError(
            f"Unsupported model version {version!r} in {model_path}. Expected 'v1' or 'v2'."
        )

    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    else:
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])

    # enc_q is removed before the weights are loaded; strict=False lets
    # load_state_dict report key mismatches (printed below) instead of raising.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(device)

    if is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()

    vc = VC(tgt_sr, config)
    return cpt, version, net_g, tgt_sr, vc
127
+
128
def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model):
    """Convert the audio at ``input_path`` with ``vc.pipeline`` and save it as a WAV file.

    The input is read with ``load_audio(input_path, 16000)``, passed through
    ``vc.pipeline`` together with the model objects and conversion settings,
    and the result is written to ``output_path`` at ``tgt_sr`` using
    ``scipy.io.wavfile.write``. Nothing is returned.

    ``cpt`` is only consulted for its ``'f0'`` entry (default ``1``), which
    is forwarded to the pipeline.
    """
    # 16000 is presumably the sample rate in Hz expected by the pipeline;
    # confirm against load_audio.
    source_audio = load_audio(input_path, 16000)

    # Three-slot list handed to the pipeline; presumably filled with
    # per-stage timings (confirm against VC.pipeline).
    stage_times = [0, 0, 0]

    f0_flag = cpt.get('f0', 1)

    # The two literal 0 arguments are passed positionally exactly as before;
    # their meaning is defined by VC.pipeline.
    converted_audio = vc.pipeline(
        hubert_model,
        net_g,
        0,
        source_audio,
        input_path,
        stage_times,
        pitch_change,
        f0_method,
        index_path,
        index_rate,
        f0_flag,
        filter_radius,
        tgt_sr,
        0,
        rms_mix_rate,
        version,
        protect,
        crepe_hop_length,
    )

    wavfile.write(output_path, tgt_sr, converted_audio)