RO-Rtechs committed on
Commit
7e6d508
·
verified ·
1 Parent(s): 3bc4b90

Upload 8 files

Browse files
Files changed (8) hide show
  1. .gitattributes +35 -35
  2. README.md +14 -13
  3. app.py +1474 -0
  4. mdx_models/data.json +354 -0
  5. packages.txt +1 -0
  6. requirements.txt +7 -0
  7. test.mp3 +0 -0
  8. utils.py +142 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,14 @@
1
- ---
2
- title: Audio Separator-rtechs
3
- emoji: 🔥
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ ---
2
+ title: Rtechs_Audio-🤖-Effects_Separator 2024
3
+ emoji: 😶‍🌫️
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 4.28.3
8
+ app_file: app.py
9
+ pinned: true
10
+ license: mit
11
+ short_description: Rtechs Vocal and background audio separator
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # os.system("pip install ./ort_nightly_gpu-1.17.0.dev20240118002-cp310-cp310-manylinux_2_28_x86_64.whl")
3
+ os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
4
+ import gc
5
+ import hashlib
6
+ import queue
7
+ import threading
8
+ import json
9
+ import shlex
10
+ import sys
11
+ import subprocess
12
+ import librosa
13
+ import numpy as np
14
+ import soundfile as sf
15
+ import torch
16
+ from tqdm import tqdm
17
+ from utils import (
18
+ remove_directory_contents,
19
+ create_directories,
20
+ download_manager,
21
+ )
22
+ import random
23
+ import spaces
24
+ from utils import logger
25
+ import onnxruntime as ort
26
+ import warnings
27
+ import spaces
28
+ import gradio as gr
29
+ import logging
30
+ import time
31
+ import traceback
32
+ from pedalboard import Pedalboard, Reverb, Delay, Chorus, Compressor, Gain, HighpassFilter, LowpassFilter
33
+ from pedalboard.io import AudioFile
34
+ import numpy as np
35
+ import yt_dlp
36
+
37
# Silence noisy library warnings (librosa/torch emit many deprecation notices).
warnings.filterwarnings("ignore")

# Gradio UI strings.
title = "<center><strong><font size='7'>Rtechs_Vocal-Audio 🎺 separator</font></strong></center>"
description = "This App is for vocal and background sound separation."
theme = "ParityError/LimeFace"

# Maps a model's primary stem name to the name used for the inverted
# (complement) track when `run_mdx` writes the inversion output.
stem_naming = {
    "Vocals": "Instrumental",
    "Other": "Instruments",
    "Instrumental": "Vocals",
    "Drums": "Drumless",
    "Bass": "Bassless",
}
50
+
51
+
52
class MDXModel:
    """STFT/iSTFT helper holding the spectrogram configuration of one
    MDX-Net ONNX model.

    The model consumes 4-channel spectrograms (real/imag x stereo), so
    `stft` packs a stereo chunk into that layout and `istft` unpacks it.
    """

    def __init__(
        self,
        device,
        dim_f,
        dim_t,
        n_fft,
        hop=1024,
        stem_name=None,
        compensation=1.000,
    ):
        """Store STFT parameters and pre-build device tensors.

        Args:
            device: torch device the window/padding tensors live on.
            dim_f: number of frequency bins the model expects.
            dim_t: number of time frames the model expects.
            n_fft: FFT size.
            hop: STFT hop length (samples).
            stem_name: primary stem this model extracts (e.g. "Vocals").
            compensation: gain factor applied when computing the inverted stem.
        """
        self.dim_f = dim_f
        self.dim_t = dim_t
        self.dim_c = 4
        self.n_fft = n_fft
        self.hop = hop
        self.stem_name = stem_name
        self.compensation = compensation

        self.n_bins = self.n_fft // 2 + 1
        # Samples per processing chunk so the STFT yields exactly dim_t frames.
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(
            window_length=self.n_fft, periodic=True
        ).to(device)

        out_c = self.dim_c

        # Zero padding re-inserted above dim_f when inverting (the model only
        # outputs the lowest dim_f frequency bins).
        self.freq_pad = torch.zeros(
            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
        ).to(device)

    def stft(self, x):
        """Stereo waveform chunk(s) -> model input spectrogram.

        Returns a tensor of shape [-1, 4, dim_f, dim_t] (real/imag x L/R),
        truncated to the dim_f bins the model consumes.
        """
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
            [-1, 4, self.n_bins, self.dim_t]
        )
        return x[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        """Model output spectrogram -> stereo waveform chunk(s).

        Args:
            x: tensor of shape [-1, 4, dim_f, dim_t].
            freq_pad: optional replacement for the zero high-band padding.

        Returns:
            Tensor of shape [-1, 2, chunk_size].
        """
        freq_pad = (
            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
            if freq_pad is None
            else freq_pad
        )
        x = torch.cat([x, freq_pad], -2)
        # c = 4*2 if self.target_name=='*' else 2
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
            [-1, 2, self.n_bins, self.dim_t]
        )
        x = x.permute([0, 2, 3, 1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
        )
        return x.reshape([-1, 2, self.chunk_size])
122
+
123
+
124
class MDX:
    """Runs an MDX-Net ONNX model over a full waveform.

    The waveform is split into chunks (`segment`), each chunk is padded and
    run through the ONNX session in worker threads, and the per-chunk
    results are joined back together.
    """

    DEFAULT_SR = 44100
    # Unit: seconds
    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

    def __init__(
        self, model_path: str, params: MDXModel, processor=0
    ):
        """Create an ONNX Runtime session for the model.

        Args:
            model_path: path to the .onnx weights file.
            params: MDXModel with the matching STFT configuration.
            processor: CUDA device index; a negative value selects CPU.
        """
        # Set the device and the provider (CPU or CUDA)
        self.device = (
            torch.device(f"cuda:{processor}")
            if processor >= 0
            else torch.device("cpu")
        )
        self.provider = (
            ["CUDAExecutionProvider"]
            if processor >= 0
            else ["CPUExecutionProvider"]
        )

        self.model = params

        # Load the ONNX model using ONNX Runtime
        self.ort = ort.InferenceSession(model_path, providers=self.provider)
        # Preload the model for faster performance (warm-up run on random data)
        self.ort.run(
            None,
            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
        )
        self.process = lambda spec: self.ort.run(
            None, {"input": spec.cpu().numpy()}
        )[0]

        # tqdm progress bar, created lazily by process_wave().
        self.prog = None

    @staticmethod
    def get_hash(model_path):
        """Return an MD5 hash identifying the model file.

        Hashes only the last ~10 MB for speed; falls back to hashing the
        whole file when it is smaller than that.
        """
        try:
            with open(model_path, "rb") as f:
                f.seek(-10000 * 1024, 2)
                model_hash = hashlib.md5(f.read()).hexdigest()
        except:  # noqa
            model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()

        return model_hash

    @staticmethod
    def segment(
        wave,
        combine=True,
        chunk_size=DEFAULT_CHUNK_SIZE,
        margin_size=DEFAULT_MARGIN_SIZE,
    ):
        """
        Segment or join segmented wave array

        Args:
            wave: (np.array) Wave array to be segmented or joined
            combine: (bool) If True, combines segmented wave array.
                If False, segments wave array.
            chunk_size: (int) Size of each segment (in samples)
            margin_size: (int) Size of margin between segments (in samples)

        Returns:
            numpy array: Segmented or joined wave array
        """

        if combine:
            # Initializing as None instead of [] for later numpy array concatenation
            processed_wave = None
            for segment_count, segment in enumerate(wave):
                # Drop the overlap margins except at the outer edges.
                start = 0 if segment_count == 0 else margin_size
                end = None if segment_count == len(wave) - 1 else -margin_size
                if margin_size == 0:
                    end = None
                if processed_wave is None:  # Create array for first segment
                    processed_wave = segment[:, start:end]
                else:  # Concatenate to existing array for subsequent segments
                    processed_wave = np.concatenate(
                        (processed_wave, segment[:, start:end]), axis=-1
                    )

        else:
            processed_wave = []
            sample_count = wave.shape[-1]

            # chunk_size <= 0 means "process the whole wave as one chunk".
            if chunk_size <= 0 or chunk_size > sample_count:
                chunk_size = sample_count

            if margin_size > chunk_size:
                margin_size = chunk_size

            for segment_count, skip in enumerate(
                range(0, sample_count, chunk_size)
            ):
                margin = 0 if segment_count == 0 else margin_size
                end = min(skip + chunk_size + margin_size, sample_count)
                start = skip - margin

                cut = wave[:, start:end].copy()
                processed_wave.append(cut)

                if end == sample_count:
                    break

        return processed_wave

    def pad_wave(self, wave):
        """
        Pad the wave array to match the required chunk size

        Args:
            wave: (np.array) Wave array to be padded

        Returns:
            tuple: (padded_wave, pad, trim)
                - padded_wave: Padded wave array
                - pad: Number of samples that were padded
                - trim: Number of samples that were trimmed
        """
        n_sample = wave.shape[1]
        trim = self.model.n_fft // 2
        gen_size = self.model.chunk_size - 2 * trim
        pad = gen_size - n_sample % gen_size

        # Padded wave: leading trim, signal, tail pad, trailing trim.
        wave_p = np.concatenate(
            (
                np.zeros((2, trim)),
                wave,
                np.zeros((2, pad)),
                np.zeros((2, trim)),
            ),
            1,
        )

        mix_waves = []
        for i in range(0, n_sample + pad, gen_size):
            waves = np.array(wave_p[:, i:i + self.model.chunk_size])
            mix_waves.append(waves)

        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(
            self.device
        )

        return mix_waves, pad, trim

    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
        """
        Process each wave segment in a multi-threaded environment

        Args:
            mix_waves: (torch.Tensor) Wave segments to be processed
            trim: (int) Number of samples trimmed during padding
            pad: (int) Number of samples padded during padding
            q: (queue.Queue) Queue to hold the processed wave segments
            _id: (int) Identifier of the processed wave segment

        Returns:
            numpy array: Processed wave segment
        """
        mix_waves = mix_waves.split(1)
        with torch.no_grad():
            pw = []
            for mix_wave in mix_waves:
                self.prog.update()
                spec = self.model.stft(mix_wave)
                processed_spec = torch.tensor(self.process(spec))
                processed_wav = self.model.istft(
                    processed_spec.to(self.device)
                )
                # Strip the trim padding and flatten back to (2, samples).
                processed_wav = (
                    processed_wav[:, :, trim:-trim]
                    .transpose(0, 1)
                    .reshape(2, -1)
                    .cpu()
                    .numpy()
                )
                pw.append(processed_wav)
        # Drop the tail padding and publish the result keyed by segment id,
        # so the consumer can restore segment order.
        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
        q.put({_id: processed_signal})
        return processed_signal

    def process_wave(self, wave: np.array, mt_threads=1):
        """
        Process the wave array in a multi-threaded environment

        Args:
            wave: (np.array) Wave array to be processed
            mt_threads: (int) Number of threads to be used for processing

        Returns:
            numpy array: Processed wave array
        """
        self.prog = tqdm(total=0)
        chunk = wave.shape[-1] // mt_threads
        waves = self.segment(wave, False, chunk)

        # Create a queue to hold the processed wave segments
        q = queue.Queue()
        threads = []
        for c, batch in enumerate(waves):
            mix_waves, pad, trim = self.pad_wave(batch)
            self.prog.total = len(mix_waves) * mt_threads
            thread = threading.Thread(
                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        self.prog.close()

        processed_batches = []
        while not q.empty():
            processed_batches.append(q.get())
        # Restore original segment order using the id each worker attached.
        processed_batches = [
            list(wave.values())[0]
            for wave in sorted(
                processed_batches, key=lambda d: list(d.keys())[0]
            )
        ]
        assert len(processed_batches) == len(
            waves
        ), "Incomplete processed batches, please reduce batch size!"
        return self.segment(processed_batches, True, chunk)
351
+
352
+
353
@spaces.GPU()
def run_mdx(
    model_params,
    output_dir,
    model_path,
    filename,
    exclude_main=False,
    exclude_inversion=False,
    suffix=None,
    invert_suffix=None,
    denoise=False,
    keep_orig=True,
    m_threads=2,
    device_base="cuda",
):
    """Separate one stem from an audio file with an MDX-Net ONNX model.

    Args:
        model_params: dict mapping model-file hashes to their MDX settings.
        output_dir: directory where the stem .wav files are written.
        model_path: path to the .onnx model weights.
        filename: input audio file; resampled to 44.1 kHz stereo.
        exclude_main: skip writing the primary stem file.
        exclude_inversion: skip writing the inverted (complement) stem file.
        suffix: override for the primary stem's filename suffix.
        invert_suffix: override for the inverted stem's filename suffix.
        denoise: run the model on both polarities and average the results.
        keep_orig: when False, the input file is deleted afterwards.
        m_threads: worker threads (recomputed from VRAM on CUDA).
        device_base: "cuda" to run on GPU 0, anything else for CPU.

    Returns:
        (main_filepath, invert_filepath); either may be None when excluded.

    Raises:
        ValueError: if the model's hash is not present in model_params.
    """
    if device_base == "cuda":
        device = torch.device("cuda:0")
        processor_num = 0
        device_properties = torch.cuda.get_device_properties(device)
        vram_gb = device_properties.total_memory / 1024**3
        # Scale the worker count with available VRAM.
        m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
        logger.info(f"threads: {m_threads} vram: {vram_gb}")
    else:
        device = torch.device("cpu")
        processor_num = -1
        m_threads = 1

    model_hash = MDX.get_hash(model_path)
    mp = model_params.get(model_hash)
    if mp is None:
        # Fail with a clear message instead of a TypeError on subscripting None.
        raise ValueError(f"No parameters found for model hash {model_hash}")
    model = MDXModel(
        device,
        dim_f=mp["mdx_dim_f_set"],
        dim_t=2 ** mp["mdx_dim_t_set"],
        n_fft=mp["mdx_n_fft_scale_set"],
        stem_name=mp["primary_stem"],
        compensation=mp["compensate"],
    )

    mdx_sess = MDX(model_path, model, processor=processor_num)
    wave, sr = librosa.load(filename, mono=False, sr=44100)
    # normalizing input wave gives better output
    peak = max(np.max(wave), abs(np.min(wave)))
    if peak == 0:
        # Silent input: dividing by a zero peak would fill the wave with NaNs.
        peak = 1.0
    wave /= peak
    if denoise:
        # Process both polarities and average: residual noise cancels out.
        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
            mdx_sess.process_wave(wave, m_threads)
        )
        wave_processed *= 0.5
    else:
        wave_processed = mdx_sess.process_wave(wave, m_threads)
    # return to previous peak
    wave_processed *= peak
    stem_name = model.stem_name if suffix is None else suffix

    main_filepath = None
    if not exclude_main:
        main_filepath = os.path.join(
            output_dir,
            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
        )
        sf.write(main_filepath, wave_processed.T, sr)

    invert_filepath = None
    if not exclude_inversion:
        diff_stem_name = (
            stem_naming.get(stem_name)
            if invert_suffix is None
            else invert_suffix
        )
        stem_name = (
            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
        )
        invert_filepath = os.path.join(
            output_dir,
            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
        )
        # Complement stem = original minus (compensated) extracted stem.
        sf.write(
            invert_filepath,
            (-wave_processed.T * model.compensation) + wave.T,
            sr,
        )

    if not keep_orig:
        os.remove(filename)

    # Release the session and large buffers before the next run.
    del mdx_sess, wave_processed, wave
    gc.collect()
    torch.cuda.empty_cache()
    return main_filepath, invert_filepath
443
+
444
+
445
def run_mdx_beta(
    model_params,
    output_dir,
    model_path,
    filename,
    exclude_main=False,
    exclude_inversion=False,
    suffix=None,
    invert_suffix=None,
    denoise=False,
    keep_orig=True,
    m_threads=2,
    device_base="",
):
    """CPU-only fallback for `run_mdx` (used when the GPU path fails).

    Same contract as `run_mdx`, but always runs on CPU and picks the
    thread count from the track duration instead of VRAM.

    Returns:
        (main_filepath, invert_filepath); either may be None when excluded.

    Raises:
        ValueError: if the model's hash is not present in model_params.
    """
    # Longer tracks get more worker threads to keep CPU runtime reasonable.
    m_threads = 1
    duration = librosa.get_duration(filename=filename)
    if duration >= 60 and duration <= 120:
        m_threads = 8
    elif duration > 120:
        m_threads = 16

    logger.info(f"threads: {m_threads}")

    model_hash = MDX.get_hash(model_path)
    device = torch.device("cpu")
    processor_num = -1
    mp = model_params.get(model_hash)
    if mp is None:
        # Fail with a clear message instead of a TypeError on subscripting None.
        raise ValueError(f"No parameters found for model hash {model_hash}")
    model = MDXModel(
        device,
        dim_f=mp["mdx_dim_f_set"],
        dim_t=2 ** mp["mdx_dim_t_set"],
        n_fft=mp["mdx_n_fft_scale_set"],
        stem_name=mp["primary_stem"],
        compensation=mp["compensate"],
    )

    mdx_sess = MDX(model_path, model, processor=processor_num)
    wave, sr = librosa.load(filename, mono=False, sr=44100)
    # normalizing input wave gives better output
    peak = max(np.max(wave), abs(np.min(wave)))
    if peak == 0:
        # Silent input: dividing by a zero peak would fill the wave with NaNs.
        peak = 1.0
    wave /= peak
    if denoise:
        # Process both polarities and average: residual noise cancels out.
        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
            mdx_sess.process_wave(wave, m_threads)
        )
        wave_processed *= 0.5
    else:
        wave_processed = mdx_sess.process_wave(wave, m_threads)
    # return to previous peak
    wave_processed *= peak
    stem_name = model.stem_name if suffix is None else suffix

    main_filepath = None
    if not exclude_main:
        main_filepath = os.path.join(
            output_dir,
            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
        )
        sf.write(main_filepath, wave_processed.T, sr)

    invert_filepath = None
    if not exclude_inversion:
        diff_stem_name = (
            stem_naming.get(stem_name)
            if invert_suffix is None
            else invert_suffix
        )
        stem_name = (
            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
        )
        invert_filepath = os.path.join(
            output_dir,
            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
        )
        # Complement stem = original minus (compensated) extracted stem.
        sf.write(
            invert_filepath,
            (-wave_processed.T * model.compensation) + wave.T,
            sr,
        )

    if not keep_orig:
        os.remove(filename)

    # Release the session and large buffers before the next run.
    del mdx_sess, wave_processed, wave
    gc.collect()
    torch.cuda.empty_cache()
    return main_filepath, invert_filepath
533
+
534
+
535
# Base URL hosting the public UVR MDX-Net ONNX weights.
MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
# Models used by this app: vocals, karaoke (backup vocals), de-reverb,
# and instrumental separation.
UVR_MODELS = [
    "UVR-MDX-NET-Voc_FT.onnx",
    "UVR_MDXNET_KARA_2.onnx",
    "Reverb_HQ_By_FoxJoy.onnx",
    "UVR-MDX-NET-Inst_HQ_4.onnx",
]
BASE_DIR = "."  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")  # weights + data.json
output_dir = os.path.join(BASE_DIR, "clean_song_output")  # separated stems
545
+
546
+
547
def convert_to_stereo_and_wav(audio_path):
    """Ensure the input audio is a stereo .wav file.

    Mono or non-wav inputs are re-encoded with ffmpeg to a 2-channel wav
    inside `output_dir`; stereo .wav inputs are returned unchanged.

    Args:
        audio_path: path to the input audio file.

    Returns:
        Path to a stereo .wav file (possibly the original path).

    Raises:
        Exception: if ffmpeg fails or produces no output file.
    """
    wave, sr = librosa.load(audio_path, mono=False, sr=44100)

    # librosa returns a 1-D array for mono input, 2-D (channels, samples)
    # for stereo; also re-encode anything that is not already a .wav.
    if wave.ndim != 2 or not audio_path.lower().endswith(".wav"):
        # Use only the basename: joining a full (possibly absolute) input
        # path onto output_dir would silently discard output_dir.
        stereo_path = os.path.join(
            output_dir,
            f"{os.path.splitext(os.path.basename(audio_path))[0]}_stereo.wav",
        )
        os.makedirs(output_dir, exist_ok=True)

        command = shlex.split(
            f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 2 -f wav "{stereo_path}"'
        )
        sub_params = {
            "stdout": subprocess.PIPE,
            "stderr": subprocess.PIPE,
            # CREATE_NO_WINDOW only exists (and is only needed) on Windows.
            "creationflags": subprocess.CREATE_NO_WINDOW
            if sys.platform == "win32"
            else 0,
        }
        process_wav = subprocess.Popen(command, **sub_params)
        output, errors = process_wav.communicate()
        if process_wav.returncode != 0 or not os.path.exists(stereo_path):
            raise Exception("Error processing audio to stereo wav")

        return stereo_path
    else:
        return audio_path
573
+
574
+
575
def get_hash(filepath):
    """Return a short (18 hex chars) blake2b digest of the file's contents."""
    hasher = hashlib.blake2b()
    with open(filepath, "rb") as stream:
        # Stream the file in fixed-size blocks so large files don't
        # need to fit in memory.
        for block in iter(lambda: stream.read(8192), b""):
            hasher.update(block)
    return hasher.hexdigest()[:18]
582
+
583
def random_sleep():
    """Block for a random 5.2–7.9 s (rounded to 0.1 s) between model runs."""
    pause = round(random.uniform(5.2, 7.9), 1)
    time.sleep(pause)
586
+
587
def process_uvr_task(
    orig_song_path: str = "aud_test.mp3",
    main_vocals: bool = False,
    dereverb: bool = True,
    song_id: str = "mdx",  # folder output name
    only_voiceless: bool = False,
    remove_files_output_dir: bool = False,
):
    """Run the full UVR separation pipeline on one audio file.

    Pipeline: convert to stereo wav -> vocal/instrumental split, then
    optionally main/backup vocal split and de-reverberation. Each GPU
    step falls back to the CPU implementation (`run_mdx_beta`) on error.

    Args:
        orig_song_path: input audio file.
        main_vocals: also split main vocals from backing vocals.
        dereverb: also run de-reverberation on the (main) vocals.
        song_id: subdirectory name under `output_dir` for the results.
        only_voiceless: only extract the instrumental ("Voiceless") track.
        remove_files_output_dir: wipe `output_dir` before processing.

    Returns:
        When only_voiceless: the (main_filepath, invert_filepath) pair
        from `run_mdx`. Otherwise a 5-tuple (vocals_path,
        instrumentals_path, backup_vocals_path, main_vocals_path,
        vocals_dereverb_path); backup_vocals_path is None when
        main_vocals is False.
    """

    device_base = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Device: {device_base}")

    if remove_files_output_dir:
        remove_directory_contents(output_dir)

    # data.json maps model-file hashes to their MDX settings.
    with open(os.path.join(mdxnet_models_dir, "data.json")) as infile:
        mdx_model_params = json.load(infile)

    song_output_dir = os.path.join(output_dir, song_id)
    create_directories(song_output_dir)
    orig_song_path = convert_to_stereo_and_wav(orig_song_path)

    logger.info(f"onnxruntime device >> {ort.get_device()}")

    if only_voiceless:
        logger.info("Voiceless Track Separation...")

        process = run_mdx(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
            orig_song_path,
            suffix="Voiceless",
            denoise=False,
            keep_orig=True,
            exclude_inversion=True,
            device_base=device_base,
        )

        return process

    logger.info("Vocal Track Isolation...")
    vocals_path, instrumentals_path = run_mdx(
        mdx_model_params,
        song_output_dir,
        os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
        orig_song_path,
        denoise=True,
        keep_orig=True,
        device_base=device_base,
    )

    if main_vocals:
        # Pause between model runs (see random_sleep).
        random_sleep()
        msg_main = "Main Voice Separation from Supporting Vocals..."
        logger.info(msg_main)
        gr.Info(msg_main)
        try:
            backup_vocals_path, main_vocals_path = run_mdx(
                mdx_model_params,
                song_output_dir,
                os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
                vocals_path,
                suffix="Backup",
                invert_suffix="Main",
                denoise=True,
                device_base=device_base,
            )
        except Exception as e:
            # GPU path failed; retry on the CPU implementation.
            backup_vocals_path, main_vocals_path = run_mdx_beta(
                mdx_model_params,
                song_output_dir,
                os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
                vocals_path,
                suffix="Backup",
                invert_suffix="Main",
                denoise=True,
                device_base=device_base,
            )
    else:
        backup_vocals_path, main_vocals_path = None, vocals_path

    if dereverb:
        random_sleep()
        msg_dereverb = "Vocal Clarity Enhancement through De-Reverberation..."
        logger.info(msg_dereverb)
        gr.Info(msg_dereverb)
        try:
            _, vocals_dereverb_path = run_mdx(
                mdx_model_params,
                song_output_dir,
                os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
                main_vocals_path,
                invert_suffix="DeReverb",
                exclude_main=True,
                denoise=True,
                device_base=device_base,
            )
        except Exception as e:
            # GPU path failed; retry on the CPU implementation.
            _, vocals_dereverb_path = run_mdx_beta(
                mdx_model_params,
                song_output_dir,
                os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
                main_vocals_path,
                invert_suffix="DeReverb",
                exclude_main=True,
                denoise=True,
                device_base=device_base,
            )
    else:
        vocals_dereverb_path = main_vocals_path

    return (
        vocals_path,
        instrumentals_path,
        backup_vocals_path,
        main_vocals_path,
        vocals_dereverb_path,
    )
706
+
707
+
708
def add_vocal_effects(input_file, output_file, reverb_room_size=0.6, vocal_reverb_dryness=0.8, reverb_damping=0.6, reverb_wet_level=0.35,
                      delay_seconds=0.4, delay_mix=0.25,
                      compressor_threshold_db=-25, compressor_ratio=3.5, compressor_attack_ms=10, compressor_release_ms=60,
                      gain_db=3):
    """Apply a vocal post-processing chain and write the result.

    Chain order: highpass -> reverb -> compressor -> optional delay ->
    optional gain. Reads `input_file` and streams the processed audio to
    `output_file` about one second at a time.
    """
    chain = [
        HighpassFilter(),
        Reverb(
            room_size=reverb_room_size,
            damping=reverb_damping,
            wet_level=reverb_wet_level,
            dry_level=vocal_reverb_dryness,
        ),
        Compressor(
            threshold_db=compressor_threshold_db,
            ratio=compressor_ratio,
            attack_ms=compressor_attack_ms,
            release_ms=compressor_release_ms,
        ),
    ]

    if delay_seconds > 0 or delay_mix > 0:
        chain.append(Delay(delay_seconds=delay_seconds, mix=delay_mix))
        print("delay applied")

    if gain_db:
        chain.append(Gain(gain_db=gain_db))
        print("added gain db")

    board = Pedalboard(chain)

    with AudioFile(input_file) as src, AudioFile(output_file, 'w', src.samplerate, src.num_channels) as dst:
        # Process roughly one second per iteration until the input is drained.
        while src.tell() < src.frames:
            block = src.read(int(src.samplerate))
            dst.write(board(block, src.samplerate, reset=False))
738
+
739
+
740
def add_instrumental_effects(input_file, output_file, highpass_freq=100, lowpass_freq=12000,
                             reverb_room_size=0.5, reverb_damping=0.5, reverb_wet_level=0.25,
                             compressor_threshold_db=-20, compressor_ratio=2.5, compressor_attack_ms=15, compressor_release_ms=80,
                             gain_db=2):
    """Apply an instrumental post-processing chain and write the result.

    Chain order: highpass -> lowpass -> optional reverb -> compressor ->
    optional gain. Reads `input_file` and streams the processed audio to
    `output_file` about one second at a time.
    """
    chain = [
        HighpassFilter(cutoff_frequency_hz=highpass_freq),
        LowpassFilter(cutoff_frequency_hz=lowpass_freq),
    ]

    if reverb_room_size > 0 or reverb_damping > 0 or reverb_wet_level > 0:
        chain.append(
            Reverb(
                room_size=reverb_room_size,
                damping=reverb_damping,
                wet_level=reverb_wet_level,
            )
        )

    chain.append(
        Compressor(
            threshold_db=compressor_threshold_db,
            ratio=compressor_ratio,
            attack_ms=compressor_attack_ms,
            release_ms=compressor_release_ms,
        )
    )

    if gain_db:
        chain.append(Gain(gain_db=gain_db))

    board = Pedalboard(chain)

    with AudioFile(input_file) as src, AudioFile(output_file, 'w', src.samplerate, src.num_channels) as dst:
        # Process roughly one second per iteration until the input is drained.
        while src.tell() < src.frames:
            block = src.read(int(src.samplerate))
            dst.write(board(block, src.samplerate, reset=False))
767
+
768
+
769
# NOTE(review): this definition is DEAD CODE — it is shadowed by the second
# `sound_separate` defined later in this file (which adds the
# `vocal_reverb_dryness` parameter). Kept verbatim; consider deleting it.
def sound_separate(media_file, stem, main, dereverb, vocal_effects=True, background_effects=True,
                   vocal_reverb_room_size=0.6, vocal_reverb_damping=0.6, vocal_reverb_wet_level=0.35,
                   vocal_delay_seconds=0.4, vocal_delay_mix=0.25,
                   vocal_compressor_threshold_db=-25, vocal_compressor_ratio=3.5, vocal_compressor_attack_ms=10, vocal_compressor_release_ms=60,
                   vocal_gain_db=4,
                   background_highpass_freq=120, background_lowpass_freq=11000,
                   background_reverb_room_size=0.5, background_reverb_damping=0.5, background_reverb_wet_level=0.25,
                   background_compressor_threshold_db=-20, background_compressor_ratio=2.5, background_compressor_attack_ms=15, background_compressor_release_ms=80,
                   background_gain_db=3):
    """Separate the requested stem from `media_file`, optionally adding effects.

    Returns a list with the path(s) of the produced audio file(s).
    Raises ValueError on missing input and Exception when nothing was produced.
    """
    if not media_file:
        raise ValueError("The audio path is missing.")

    if not stem:
        raise ValueError("Please select 'vocal' or 'background' stem.")

    # Hash keys the UVR cache directory for this input file.
    hash_audio = str(get_hash(media_file))
    media_dir = os.path.dirname(media_file)

    outputs = []

    start_time = time.time()

    if stem == "vocal":
        try:
            # process_uvr_task returns several stems; only the vocal is used here.
            _, _, _, _, vocal_audio = process_uvr_task(
                orig_song_path=media_file,
                song_id=hash_audio + "mdx",
                main_vocals=main,
                dereverb=dereverb,
                remove_files_output_dir=False,
            )

            if vocal_effects:
                suffix = '_effects'
                file_name, file_extension = os.path.splitext(vocal_audio)
                out_effects = file_name + suffix + file_extension
                out_effects_path = os.path.join(media_dir, out_effects)
                add_vocal_effects(vocal_audio, out_effects_path,
                                  reverb_room_size=vocal_reverb_room_size, reverb_damping=vocal_reverb_damping, reverb_wet_level=vocal_reverb_wet_level,
                                  delay_seconds=vocal_delay_seconds, delay_mix=vocal_delay_mix,
                                  compressor_threshold_db=vocal_compressor_threshold_db, compressor_ratio=vocal_compressor_ratio, compressor_attack_ms=vocal_compressor_attack_ms, compressor_release_ms=vocal_compressor_release_ms,
                                  gain_db=vocal_gain_db
                                  )
                vocal_audio = out_effects_path

            outputs.append(vocal_audio)
        except Exception as error:
            # Best-effort: log and fall through so the empty-outputs check raises.
            logger.error(str(error))
            traceback.print_exc()

    if stem == "background":
        background_audio, _ = process_uvr_task(
            orig_song_path=media_file,
            song_id=hash_audio + "voiceless",
            only_voiceless=True,
            remove_files_output_dir=False,
        )

        if background_effects:
            suffix = '_effects'
            file_name, file_extension = os.path.splitext(background_audio)
            out_effects = file_name + suffix + file_extension
            out_effects_path = os.path.join(media_dir, out_effects)
            add_instrumental_effects(background_audio, out_effects_path,
                                     highpass_freq=background_highpass_freq, lowpass_freq=background_lowpass_freq,
                                     reverb_room_size=background_reverb_room_size, reverb_damping=background_reverb_damping, reverb_wet_level=background_reverb_wet_level,
                                     compressor_threshold_db=background_compressor_threshold_db, compressor_ratio=background_compressor_ratio, compressor_attack_ms=background_compressor_attack_ms, compressor_release_ms=background_compressor_release_ms,
                                     gain_db=background_gain_db
                                     )
            background_audio = out_effects_path

        outputs.append(background_audio)

    end_time = time.time()
    execution_time = end_time - start_time
    logger.info(f"Execution time: {execution_time} seconds")

    if not outputs:
        raise Exception("Error in sound separation.")

    return outputs
850
+
851
+
852
def _effects_output_path(audio_path, suffix="_effects"):
    """Return the path for the effected copy, written next to the source audio.

    The previous code joined this absolute path onto `media_dir`, but
    `os.path.join` discards the left operand when the right one is absolute,
    so the output always landed beside `audio_path`; this makes that explicit.
    """
    file_name, file_extension = os.path.splitext(os.path.abspath(audio_path))
    return file_name + suffix + file_extension


def sound_separate(media_file, stem, main, dereverb, vocal_effects=True, background_effects=True,
                   vocal_reverb_room_size=0.6, vocal_reverb_damping=0.6, vocal_reverb_dryness=0.8, vocal_reverb_wet_level=0.35,
                   vocal_delay_seconds=0.4, vocal_delay_mix=0.25,
                   vocal_compressor_threshold_db=-25, vocal_compressor_ratio=3.5, vocal_compressor_attack_ms=10, vocal_compressor_release_ms=60,
                   vocal_gain_db=4,
                   background_highpass_freq=120, background_lowpass_freq=11000,
                   background_reverb_room_size=0.5, background_reverb_damping=0.5, background_reverb_wet_level=0.25,
                   background_compressor_threshold_db=-20, background_compressor_ratio=2.5, background_compressor_attack_ms=15, background_compressor_release_ms=80,
                   background_gain_db=3):
    """Separate the requested stem from `media_file`, optionally adding effects.

    Parameters mirror the UI controls: `stem` is "vocal" or "background",
    `main`/`dereverb` tune the UVR vocal pass, and the remaining keyword
    arguments parameterize the Pedalboard effect chains.

    Returns a list with the path(s) of the produced audio file(s).
    Raises ValueError on missing input and Exception when nothing was produced.
    """
    if not media_file:
        raise ValueError("The audio path is missing.")

    if not stem:
        raise ValueError("Please select 'vocal' or 'background' stem.")

    # Hash keys the UVR cache directory for this input file.
    hash_audio = str(get_hash(media_file))

    outputs = []

    start_time = time.time()

    if stem == "vocal":
        try:
            # process_uvr_task returns several stems; only the vocal is used here.
            _, _, _, _, vocal_audio = process_uvr_task(
                orig_song_path=media_file,
                song_id=hash_audio + "mdx",
                main_vocals=main,
                dereverb=dereverb,
                remove_files_output_dir=False,
            )

            if vocal_effects:
                out_effects_path = _effects_output_path(vocal_audio)
                add_vocal_effects(
                    vocal_audio, out_effects_path,
                    reverb_room_size=vocal_reverb_room_size,
                    reverb_damping=vocal_reverb_damping,
                    # NOTE(review): keyword kept exactly as in the original
                    # call; confirm add_vocal_effects accepts
                    # `vocal_reverb_dryness` (its siblings use `reverb_*` names).
                    vocal_reverb_dryness=vocal_reverb_dryness,
                    reverb_wet_level=vocal_reverb_wet_level,
                    delay_seconds=vocal_delay_seconds,
                    delay_mix=vocal_delay_mix,
                    compressor_threshold_db=vocal_compressor_threshold_db,
                    compressor_ratio=vocal_compressor_ratio,
                    compressor_attack_ms=vocal_compressor_attack_ms,
                    compressor_release_ms=vocal_compressor_release_ms,
                    gain_db=vocal_gain_db,
                )
                vocal_audio = out_effects_path

            outputs.append(vocal_audio)
        except Exception as error:
            # Log message AND stack trace (the trace was dropped in this copy).
            logger.error(str(error))
            traceback.print_exc()

    if stem == "background":
        background_audio, _ = process_uvr_task(
            orig_song_path=media_file,
            song_id=hash_audio + "voiceless",
            only_voiceless=True,
            remove_files_output_dir=False,
        )

        if background_effects:
            out_effects_path = _effects_output_path(background_audio)
            add_instrumental_effects(
                background_audio, out_effects_path,
                highpass_freq=background_highpass_freq,
                lowpass_freq=background_lowpass_freq,
                reverb_room_size=background_reverb_room_size,
                reverb_damping=background_reverb_damping,
                reverb_wet_level=background_reverb_wet_level,
                compressor_threshold_db=background_compressor_threshold_db,
                compressor_ratio=background_compressor_ratio,
                compressor_attack_ms=background_compressor_attack_ms,
                compressor_release_ms=background_compressor_release_ms,
                gain_db=background_gain_db,
            )
            background_audio = out_effects_path

        outputs.append(background_audio)

    execution_time = time.time() - start_time
    logger.info(f"Execution time: {execution_time} seconds")

    if not outputs:
        raise Exception("Error in sound separation.")

    return outputs
933
+
934
+
935
def audio_downloader(
    url_media,
):
    """Download the audio track of `url_media` as .m4a and return its path.

    Returns None when the URL is empty. Raises whatever yt-dlp raises on
    download failure.
    """
    url_media = url_media.strip()

    if not url_media:
        return None

    logger.info(f"Downloading media: {url_media[:80]}")

    dir_output_downloads = "downloads"
    os.makedirs(dir_output_downloads, exist_ok=True)

    media_info = yt_dlp.YoutubeDL(
        {"quiet": True, "no_warnings": True, "noplaylist": True}
    ).extract_info(url_media, download=False)

    # Sanitize the title ourselves: 'restrictfilenames' only applies to
    # yt-dlp's own output templating, not to this manually built path, so a
    # title containing '/' or other unsafe characters would break the download.
    safe_title = "".join(
        ch if ch.isalnum() or ch in " ._-" else "_" for ch in media_info["title"]
    )
    download_path = f"{os.path.join(dir_output_downloads, safe_title)}.m4a"

    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'postprocessors': [{  # Extract audio using ffmpeg
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }],
        'force_overwrites': True,
        'noplaylist': True,
        'no_warnings': True,
        'quiet': True,
        'ignore_no_formats_error': True,
        'restrictfilenames': True,
        'outtmpl': download_path,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl_download:
        ydl_download.download([url_media])

    return download_path
972
+
973
+
974
def downloader_conf():
    """Checkbox that toggles the URL-to-Audio downloader widgets."""
    opts = dict(label="URL-to-Audio", container=False)
    return gr.Checkbox(False, **opts)
981
+
982
+
983
def url_media_conf():
    """Textbox for the media URL; hidden until the downloader is enabled."""
    return gr.Textbox(
        value="",
        label="Enter URL",
        placeholder="www.youtube.com/watch?v=g_9rPvbENUw",
        visible=False,
        lines=1,
    )
991
+
992
+
993
def url_button_conf():
    """'Go' button triggering the URL download; hidden by default."""
    return gr.Button(
        value="Go",
        variant="secondary",
        visible=False,
    )
999
+
1000
+
1001
def show_components_downloader(value_active):
    """Show or hide the URL textbox and its Go button together."""
    updates = tuple(gr.update(visible=value_active) for _ in range(2))
    return updates
1007
+
1008
+
1009
def audio_conf():
    """File input for the audio to be processed (returns a filepath)."""
    opts = dict(label="Audio file", type="filepath", container=True)
    return gr.File(**opts)
1016
+
1017
+
1018
def stem_conf():
    """Radio selector choosing which stem to extract."""
    return gr.Radio(
        choices=["vocal", "background"],
        value="vocal",
        label="Stem",
    )
1025
+
1026
+
1027
def main_conf():
    """Checkbox selecting the 'main vocals' UVR pass."""
    return gr.Checkbox(False, label="Main")
1033
+
1034
+
1035
def dereverb_conf():
    """Checkbox enabling the vocal dereverb pass."""
    opts = dict(label="Dereverb", visible=True)
    return gr.Checkbox(False, **opts)
1042
+
1043
+
1044
def vocal_effects_conf():
    """Checkbox enabling the vocal effects chain."""
    opts = dict(label="Vocal Effects", visible=True)
    return gr.Checkbox(False, **opts)
1051
+
1052
+
1053
def background_effects_conf():
    """Checkbox enabling the background effects chain; hidden for the vocal stem."""
    opts = dict(label="Background Effects", visible=False)
    return gr.Checkbox(False, **opts)
1060
+
1061
+
1062
def vocal_reverb_room_size_conf():
    """Number input: vocal reverb room size, 0.0-1.0."""
    return gr.Number(
        value=0.15,
        label="Vocal Reverb Room Size",
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        visible=True,
    )
1071
+
1072
+
1073
def vocal_reverb_damping_conf():
    """Number input: vocal reverb damping, 0.0-1.0."""
    return gr.Number(
        value=0.7,
        label="Vocal Reverb Damping",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        visible=True,
    )
1082
+
1083
+
1084
def vocal_reverb_wet_level_conf():
    """Number input: vocal reverb wet level, 0.0-1.0."""
    return gr.Number(
        value=0.2,
        label="Vocal Reverb Wet Level",
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        visible=True,
    )
1093
+
1094
+
1095
def vocal_reverb_dryness_level_conf():
    """Number input: vocal reverb dryness level, 0.0-1.0."""
    return gr.Number(
        value=0.8,
        label="Vocal Reverb Dryness Level",
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        visible=True,
    )
1104
+
1105
+
1106
def vocal_delay_seconds_conf():
    """Number input: vocal delay time in seconds, 0.0-1.0."""
    return gr.Number(
        value=0.,
        label="Vocal Delay Seconds",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        visible=True,
    )
1115
+
1116
+
1117
def vocal_delay_mix_conf():
    """Number input: vocal delay wet/dry mix, 0.0-1.0."""
    return gr.Number(
        value=0.,
        label="Vocal Delay Mix",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        visible=True,
    )
1126
+
1127
+
1128
def vocal_compressor_threshold_db_conf():
    """Number input: vocal compressor threshold in dB (-60..0)."""
    return gr.Number(
        value=-15,
        label="Vocal Compressor Threshold (dB)",
        minimum=-60,
        maximum=0,
        step=1,
        visible=True,
    )
1137
+
1138
+
1139
def vocal_compressor_ratio_conf():
    """Number input: vocal compressor ratio (0-20)."""
    return gr.Number(
        value=4.,
        label="Vocal Compressor Ratio",
        minimum=0,
        maximum=20,
        step=0.1,
        visible=True,
    )
1148
+
1149
+
1150
def vocal_compressor_attack_ms_conf():
    """Number input: vocal compressor attack in ms (0-1000)."""
    return gr.Number(
        value=1.0,
        label="Vocal Compressor Attack (ms)",
        minimum=0,
        maximum=1000,
        step=1,
        visible=True,
    )
1159
+
1160
+
1161
def vocal_compressor_release_ms_conf():
    """Number input: vocal compressor release in ms (0-3000)."""
    return gr.Number(
        value=100,
        label="Vocal Compressor Release (ms)",
        minimum=0,
        maximum=3000,
        step=1,
        visible=True,
    )
1170
+
1171
+
1172
def vocal_gain_db_conf():
    """Number input: vocal output gain in dB (-40..40)."""
    return gr.Number(
        value=0,
        label="Vocal Gain (dB)",
        minimum=-40,
        maximum=40,
        step=1,
        visible=True,
    )
1181
+
1182
+
1183
def background_highpass_freq_conf():
    """Number input: background highpass cutoff in Hz (0-1000)."""
    return gr.Number(
        value=120,
        label="Background Highpass Frequency (Hz)",
        minimum=0,
        maximum=1000,
        step=1,
        visible=True,
    )
1192
+
1193
+
1194
def background_lowpass_freq_conf():
    """Number input: background lowpass cutoff in Hz (0-20000)."""
    return gr.Number(
        value=11000,
        label="Background Lowpass Frequency (Hz)",
        minimum=0,
        maximum=20000,
        step=1,
        visible=True,
    )
1203
+
1204
+
1205
def background_reverb_room_size_conf():
    """Number input: background reverb room size, 0.0-1.0."""
    return gr.Number(
        value=0.1,
        label="Background Reverb Room Size",
        minimum=0.0,
        maximum=1.0,
        step=0.1,
        visible=True,
    )
1214
+
1215
+
1216
def background_reverb_damping_conf():
    """Number input: background reverb damping, 0.0-1.0."""
    return gr.Number(
        value=0.5,
        label="Background Reverb Damping",
        minimum=0.0,
        maximum=1.0,
        step=0.1,
        visible=True,
    )
1225
+
1226
+
1227
def background_reverb_wet_level_conf():
    """Number input: background reverb wet level, 0.0-1.0."""
    return gr.Number(
        value=0.25,
        label="Background Reverb Wet Level",
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        visible=True,
    )
1236
+
1237
+
1238
def background_compressor_threshold_db_conf():
    """Number input: background compressor threshold in dB (-60..0)."""
    return gr.Number(
        value=-15,
        label="Background Compressor Threshold (dB)",
        minimum=-60,
        maximum=0,
        step=1,
        visible=True,
    )
1247
+
1248
+
1249
def background_compressor_ratio_conf():
    """Number input: background compressor ratio (0-20)."""
    return gr.Number(
        value=4.,
        label="Background Compressor Ratio",
        minimum=0,
        maximum=20,
        step=0.1,
        visible=True,
    )
1258
+
1259
+
1260
def background_compressor_attack_ms_conf():
    """Number input: background compressor attack in ms (0-1000)."""
    return gr.Number(
        value=15,
        label="Background Compressor Attack (ms)",
        minimum=0,
        maximum=1000,
        step=1,
        visible=True,
    )
1269
+
1270
+
1271
def background_compressor_release_ms_conf():
    """Number input: background compressor release in ms (0-3000)."""
    return gr.Number(
        value=60,
        label="Background Compressor Release (ms)",
        minimum=0,
        maximum=3000,
        step=1,
        visible=True,
    )
1280
+
1281
+
1282
def background_gain_db_conf():
    """Number input: background output gain in dB (-40..40)."""
    return gr.Number(
        value=0,
        label="Background Gain (dB)",
        minimum=-40,
        maximum=40,
        step=1,
        visible=True,
    )
1291
+
1292
+
1293
def button_conf():
    """Primary button that launches the separation run."""
    return gr.Button(value="Inference", variant="primary")
1298
+
1299
+
1300
def output_conf():
    """Read-only multi-file component holding the produced stems."""
    opts = dict(label="Result", file_count="multiple", interactive=False)
    return gr.File(**opts)
1306
+
1307
+
1308
def show_vocal_components(value_name):
    """Toggle per-stem controls: vocal stem shows (main, dereverb, vocal
    effects) and hides the background-effects checkbox; any other stem
    does the opposite."""
    is_vocal = value_name == "vocal"
    return (
        gr.update(visible=is_vocal),
        gr.update(visible=is_vocal),
        gr.update(visible=is_vocal),
        gr.update(visible=not is_vocal),
    )
1322
+
1323
+
1324
def get_gui(theme):
    """Build and return the Gradio Blocks UI for the separator app.

    Wires the optional URL downloader, the stem/effect controls, the
    inference button and the examples. `title`, `description` and the
    `*_conf` factories are module-level — presumably defined earlier in
    this file (not visible here).
    """
    with gr.Blocks(theme=theme) as app:
        gr.Markdown(title)
        gr.Markdown(description)

        # --- Optional URL-to-audio downloader -------------------------------
        downloader_gui = downloader_conf()
        with gr.Row():
            with gr.Column(scale=2):
                url_media_gui = url_media_conf()
            with gr.Column(scale=1):
                url_button_gui = url_button_conf()

        # Toggling the checkbox shows/hides the URL textbox and Go button.
        downloader_gui.change(
            show_components_downloader,
            [downloader_gui],
            [url_media_gui, url_button_gui]
        )

        aud = audio_conf()

        # Go downloads the URL's audio and places it in the file input.
        url_button_gui.click(
            audio_downloader,
            [url_media_gui],
            [aud]
        )

        # --- Stem selection and per-stem options ----------------------------
        with gr.Column():
            with gr.Row():
                stem_gui = stem_conf()

        with gr.Column():
            with gr.Row():
                main_gui = main_conf()
                dereverb_gui = dereverb_conf()
                vocal_effects_gui = vocal_effects_conf()
                background_effects_gui = background_effects_conf()

            with gr.Accordion("Vocal Effects Parameters", open=False):
                with gr.Row():
                    vocal_reverb_room_size_gui = vocal_reverb_room_size_conf()
                    vocal_reverb_damping_gui = vocal_reverb_damping_conf()
                    vocal_reverb_dryness_gui = vocal_reverb_dryness_level_conf()
                    vocal_reverb_wet_level_gui = vocal_reverb_wet_level_conf()
                    vocal_delay_seconds_gui = vocal_delay_seconds_conf()
                    vocal_delay_mix_gui = vocal_delay_mix_conf()
                    vocal_compressor_threshold_db_gui = vocal_compressor_threshold_db_conf()
                    vocal_compressor_ratio_gui = vocal_compressor_ratio_conf()
                    vocal_compressor_attack_ms_gui = vocal_compressor_attack_ms_conf()
                    vocal_compressor_release_ms_gui = vocal_compressor_release_ms_conf()
                    vocal_gain_db_gui = vocal_gain_db_conf()

            with gr.Accordion("Background Effects Parameters", open=False):
                with gr.Row():
                    background_highpass_freq_gui = background_highpass_freq_conf()
                    background_lowpass_freq_gui = background_lowpass_freq_conf()
                    background_reverb_room_size_gui = background_reverb_room_size_conf()
                    background_reverb_damping_gui = background_reverb_damping_conf()
                    background_reverb_wet_level_gui = background_reverb_wet_level_conf()
                    background_compressor_threshold_db_gui = background_compressor_threshold_db_conf()
                    background_compressor_ratio_gui = background_compressor_ratio_conf()
                    background_compressor_attack_ms_gui = background_compressor_attack_ms_conf()
                    background_compressor_release_ms_gui = background_compressor_release_ms_conf()
                    background_gain_db_gui = background_gain_db_conf()

            # Switching the stem swaps which per-stem controls are visible.
            stem_gui.change(
                show_vocal_components,
                [stem_gui],
                [main_gui, dereverb_gui, vocal_effects_gui, background_effects_gui],
            )

            button_base = button_conf()
            output_base = output_conf()

            # Input order must match the `sound_separate` signature exactly.
            button_base.click(
                sound_separate,
                inputs=[
                    aud,
                    stem_gui,
                    main_gui,
                    dereverb_gui,
                    vocal_effects_gui,
                    background_effects_gui,
                    vocal_reverb_room_size_gui, vocal_reverb_damping_gui, vocal_reverb_dryness_gui, vocal_reverb_wet_level_gui,
                    vocal_delay_seconds_gui, vocal_delay_mix_gui, vocal_compressor_threshold_db_gui, vocal_compressor_ratio_gui,
                    vocal_compressor_attack_ms_gui, vocal_compressor_release_ms_gui, vocal_gain_db_gui,
                    background_highpass_freq_gui, background_lowpass_freq_gui, background_reverb_room_size_gui,
                    background_reverb_damping_gui, background_reverb_wet_level_gui, background_compressor_threshold_db_gui,
                    background_compressor_ratio_gui, background_compressor_attack_ms_gui, background_compressor_release_ms_gui,
                    background_gain_db_gui,
                ],
                outputs=[output_base],
            )

            gr.Examples(
                examples=[
                    [
                        "./test.mp3",
                        "vocal",
                        False,
                        False,
                        False,
                        False,
                        0.15, 0.7, 0.8, 0.2,
                        0., 0., -15, 4., 1, 100, 0,
                        120, 11000, 0.5, 0.1, 0.25, -15, 4., 15, 60, 0,
                    ],
                ],
                fn=sound_separate,
                inputs=[
                    aud,
                    stem_gui,
                    main_gui,
                    dereverb_gui,
                    vocal_effects_gui,
                    background_effects_gui,
                    vocal_reverb_room_size_gui, vocal_reverb_damping_gui, vocal_reverb_dryness_gui, vocal_reverb_wet_level_gui,
                    vocal_delay_seconds_gui, vocal_delay_mix_gui, vocal_compressor_threshold_db_gui, vocal_compressor_ratio_gui,
                    vocal_compressor_attack_ms_gui, vocal_compressor_release_ms_gui, vocal_gain_db_gui,
                    background_highpass_freq_gui, background_lowpass_freq_gui, background_reverb_room_size_gui,
                    background_reverb_damping_gui, background_reverb_wet_level_gui, background_compressor_threshold_db_gui,
                    background_compressor_ratio_gui, background_compressor_attack_ms_gui, background_compressor_release_ms_gui,
                    background_gain_db_gui,
                ],
                outputs=[output_base],
                cache_examples=False,
            )

    return app
1455
+
1456
+
1457
if __name__ == "__main__":

    # Fetch every UVR/MDX checkpoint before building the UI.
    # NOTE(review): os.path.join on a URL works on POSIX but would use '\\'
    # on Windows — confirm this Space only ever runs on Linux.
    for id_model in UVR_MODELS:
        download_manager(
            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
        )

    app = get_gui(theme)

    # Queue so concurrent users share the worker pool.
    app.queue(default_concurrency_limit=40)

    app.launch(
        max_threads=40,
        share=True,
        show_error=True,
        quiet=False,
        debug=False,
    )
mdx_models/data.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.035,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "0f2a6bc5b49d87d64728ee40e23bceb1": {
290
+ "compensate": 1.019,
291
+ "mdx_dim_f_set": 2560,
292
+ "mdx_dim_t_set": 8,
293
+ "mdx_n_fft_scale_set": 5120,
294
+ "primary_stem": "Instrumental"
295
+ },
296
+ "b02be2d198d4968a121030cf8950b492": {
297
+ "compensate": 1.020,
298
+ "mdx_dim_f_set": 2560,
299
+ "mdx_dim_t_set": 8,
300
+ "mdx_n_fft_scale_set": 5120,
301
+ "primary_stem": "No Crowd"
302
+ },
303
+ "2154254ee89b2945b97a7efed6e88820": {
304
+ "config_yaml": "model_2_stem_061321.yaml"
305
+ },
306
+ "063aadd735d58150722926dcbf5852a9": {
307
+ "config_yaml": "model_2_stem_061321.yaml"
308
+ },
309
+ "fe96801369f6a148df2720f5ced88c19": {
310
+ "config_yaml": "model3.yaml"
311
+ },
312
+ "02e8b226f85fb566e5db894b9931c640": {
313
+ "config_yaml": "model2.yaml"
314
+ },
315
+ "e3de6d861635ab9c1d766149edd680d6": {
316
+ "config_yaml": "model1.yaml"
317
+ },
318
+ "3f2936c554ab73ce2e396d54636bd373": {
319
+ "config_yaml": "modelB.yaml"
320
+ },
321
+ "890d0f6f82d7574bca741a9e8bcb8168": {
322
+ "config_yaml": "modelB.yaml"
323
+ },
324
+ "63a3cb8c37c474681049be4ad1ba8815": {
325
+ "config_yaml": "modelB.yaml"
326
+ },
327
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
328
+ "config_yaml": "modelA.yaml"
329
+ },
330
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
331
+ "config_yaml": "modelA.yaml"
332
+ },
333
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
334
+ "config_yaml": "modelA.yaml"
335
+ },
336
+ "c9971a18da20911822593dc81caa8be9": {
337
+ "config_yaml": "sndfx.yaml"
338
+ },
339
+ "57d94d5ed705460d21c75a5ac829a605": {
340
+ "config_yaml": "sndfx.yaml"
341
+ },
342
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
343
+ "config_yaml": "sndfx.yaml"
344
+ },
345
+ "104081d24e37217086ce5fde09147ee1": {
346
+ "config_yaml": "model_2_stem_061321.yaml"
347
+ },
348
+ "1e6165b601539f38d0a9330f3facffeb": {
349
+ "config_yaml": "model_2_stem_061321.yaml"
350
+ },
351
+ "fe0108464ce0d8271be5ab810891bd7c": {
352
+ "config_yaml": "model_2_stem_full_band.yaml"
353
+ }
354
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ soundfile
2
+ librosa
3
+ torch==2.2.0
4
+ pedalboard
5
+ yt-dlp
6
+ spaces
7
+ ffmpeg
test.mp3 ADDED
Binary file (193 kB). View file
 
utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, zipfile, shutil, subprocess, shlex, sys # noqa
2
+ from urllib.parse import urlparse
3
+ import re
4
+ import logging
5
+
6
+
7
def load_file_from_url(
    url: str,
    model_dir: str,
    file_name: str | None = None,
    overwrite: bool = False,
    progress: bool = True,
) -> str:
    """Fetch *url* into *model_dir*, reusing an existing download when possible.

    Parameters
    ----------
    url : str
        Remote location of the file.
    model_dir : str
        Directory the file is stored in (created if missing).
    file_name : str | None
        Local name to save under; defaults to the basename of the URL path.
    overwrite : bool
        When True, an existing copy is deleted and re-downloaded.
    progress : bool
        Show a download progress bar.

    Returns
    -------
    str
        Absolute path of the local file.
    """
    os.makedirs(model_dir, exist_ok=True)

    target_name = file_name or os.path.basename(urlparse(url).path)
    cached_file = os.path.abspath(os.path.join(model_dir, target_name))

    # Drop a stale or zero-byte copy so it gets fetched again below.
    if os.path.exists(cached_file) and (
        overwrite or os.path.getsize(cached_file) == 0
    ):
        remove_files(cached_file)

    if os.path.exists(cached_file):
        logger.debug(cached_file)
    else:
        logger.info(f'Downloading: "{url}" to {cached_file}\n')
        from torch.hub import download_url_to_file

        download_url_to_file(url, cached_file, progress=progress)

    return cached_file
40
+
41
+
42
def friendly_name(file: str):
    """Split a local path or URL into ``(base_name, extension)``.

    For URLs only the path component is considered, so query strings and
    fragments do not leak into the name.
    """
    path = urlparse(file).path if file.startswith("http") else file
    base = os.path.basename(path)
    return os.path.splitext(base)
49
+
50
+
51
def download_manager(
    url: str,
    path: str,
    extension: str = "",
    overwrite: bool = False,
    progress: bool = True,
):
    """Download ``url`` into directory ``path`` and return the local filename.

    When ``extension`` is given it replaces the extension taken from the URL.
    Non-http(s) inputs are treated as already-local: ``path`` is returned
    unchanged (original behavior, preserved here).
    """
    url = url.strip()

    base, original_ext = friendly_name(url)
    # Either keep the URL's own extension or force the caller-supplied one.
    target_name = base + (f".{extension}" if extension else original_ext)

    if not url.startswith("http"):
        return path

    return load_file_from_url(
        url=url,
        model_dir=path,
        file_name=target_name,
        overwrite=overwrite,
        progress=progress,
    )
75
+
76
+
77
def remove_files(file_list):
    """Delete the given file path(s); paths that do not exist are skipped.

    Accepts either a single path string or an iterable of paths.
    """
    targets = [file_list] if isinstance(file_list, str) else file_list
    for target in targets:
        if os.path.exists(target):
            os.remove(target)
84
+
85
+
86
def remove_directory_contents(directory_path):
    """Delete every file and subdirectory inside ``directory_path``.

    The directory itself is kept.  Failures on individual entries are
    logged and do not abort the sweep; a missing directory is logged
    as an error and nothing is removed.

    Parameters:
        directory_path (str): Path to the directory whose
            contents need to be removed.
    """
    if not os.path.exists(directory_path):
        logger.error(f"Directory '{directory_path}' does not exist.")
        return

    for entry in os.listdir(directory_path):
        file_path = os.path.join(directory_path, entry)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
            elif os.path.isfile(file_path):
                os.remove(file_path)
        except Exception as e:
            logger.error(f"Failed to delete {file_path}. Reason: {e}")
    logger.info(f"Content in '{directory_path}' removed.")
107
+
108
+
109
def create_directories(directory_path):
    """Create each missing directory in ``directory_path``.

    Accepts a single path string or a list of paths; existing
    directories are left untouched.
    """
    paths = [directory_path] if isinstance(directory_path, str) else directory_path
    for one_dir_path in paths:
        if os.path.exists(one_dir_path):
            continue
        os.makedirs(one_dir_path)
        logger.debug(f"Directory '{one_dir_path}' created.")
117
+
118
+
119
def setup_logger(name_log):
    """Return a logger named *name_log* writing "[LEVEL] >> message" to stderr.

    The logger is set to INFO level with ``propagate`` disabled so records
    are not duplicated through the root logger.

    Fix: ``logging.getLogger`` returns a cached logger per name, and the
    original attached a fresh ``StreamHandler`` on every call — calling this
    twice with the same name duplicated every log line.  The handler is now
    attached only when the logger has none yet.

    Parameters:
        name_log (str): Name of the logger to create or fetch.

    Returns:
        logging.Logger: The configured logger instance.
    """
    logger = logging.getLogger(name_log)
    logger.setLevel(logging.INFO)
    logger.propagate = False

    # Attach the stderr handler only once per named logger.
    if not logger.handlers:
        _default_handler = logging.StreamHandler()  # sys.stderr by default
        _default_handler.flush = sys.stderr.flush
        logger.addHandler(_default_handler)

    # Keep every attached handler on the shared "[LEVEL] >> msg" format.
    formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
    for handler in logger.handlers:
        handler.setFormatter(formatter)

    return logger
138
+
139
+
140
# Module-wide logger shared by all helpers in this file.
logger = setup_logger("ss")
# Redundant with setup_logger's own setLevel(INFO), but kept as-is.
logger.setLevel(logging.INFO)
142
+