Nguyen Ha Lan
committed on
Commit
·
d1199f2
1
Parent(s):
2498595
add postprocess voice
Browse files- .gitignore +4 -1
- __pycache__/openvoice.cpython-310.pyc +0 -0
- __pycache__/postprocess.cpython-310.pyc +0 -0
- app.py +18 -3
- checkpoints_v2/base_speakers/ses/en-au.pth +3 -0
- checkpoints_v2/base_speakers/ses/en-br.pth +3 -0
- checkpoints_v2/base_speakers/ses/en-default.pth +3 -0
- checkpoints_v2/base_speakers/ses/en-india.pth +3 -0
- checkpoints_v2/base_speakers/ses/en-newest.pth +3 -0
- checkpoints_v2/base_speakers/ses/en-us.pth +3 -0
- checkpoints_v2/base_speakers/ses/es.pth +3 -0
- checkpoints_v2/base_speakers/ses/fr.pth +3 -0
- checkpoints_v2/base_speakers/ses/jp.pth +3 -0
- checkpoints_v2/base_speakers/ses/kr.pth +3 -0
- checkpoints_v2/base_speakers/ses/zh.pth +3 -0
- checkpoints_v2/converter/checkpoint.pth +3 -0
- checkpoints_v2/converter/config.json +57 -0
- output.wav +0 -0
- postprocess.py +59 -0
- requirements.txt +7 -1
.gitignore
CHANGED
@@ -1 +1,4 @@
|
|
1 |
-
model
|
|
|
|
|
|
|
|
1 |
+
model
|
2 |
+
outputs
|
3 |
+
processed
|
4 |
+
checkpoints_v2_0417.zip
|
__pycache__/openvoice.cpython-310.pyc
ADDED
Binary file (1.94 kB). View file
|
|
__pycache__/postprocess.cpython-310.pyc
ADDED
Binary file (2.35 kB). View file
|
|
app.py
CHANGED
@@ -18,6 +18,10 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
|
|
18 |
from TTS.tts.configs.xtts_config import XttsConfig
|
19 |
from TTS.tts.models.xtts import Xtts
|
20 |
from vinorm import TTSnorm
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# download for mecab
|
23 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
@@ -39,6 +43,7 @@ if not all(file in files_in_dir for file in required_files):
|
|
39 |
repo_id=repo_id,
|
40 |
repo_type="model",
|
41 |
local_dir=checkpoint_dir,
|
|
|
42 |
)
|
43 |
|
44 |
xtts_config = os.path.join(checkpoint_dir, "config.json")
|
@@ -52,6 +57,8 @@ MODEL.load_checkpoint(
|
|
52 |
if torch.cuda.is_available():
|
53 |
MODEL.cuda()
|
54 |
|
|
|
|
|
55 |
supported_languages = config.languages
|
56 |
if not "vi" in supported_languages:
|
57 |
supported_languages.append("vi")
|
@@ -71,7 +78,7 @@ def normalize_vietnamese_text(text):
|
|
71 |
.replace("A.I", "Ây Ai")
|
72 |
.replace("ad", "át")
|
73 |
.replace("marketing", "ma két tin")
|
74 |
-
.replace("tienziven", "
|
75 |
)
|
76 |
return text
|
77 |
|
@@ -171,7 +178,15 @@ def predict(
|
|
171 |
keep_len = calculate_keep_len(prompt, language)
|
172 |
out["wav"] = out["wav"][:keep_len]
|
173 |
|
174 |
-
torchaudio.save("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
except RuntimeError as e:
|
177 |
if "device-side assert" in str(e):
|
@@ -313,7 +328,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
313 |
)
|
314 |
normalize_text = gr.Checkbox(
|
315 |
label="Chuẩn hóa văn bản tiếng Việt",
|
316 |
-
info="
|
317 |
value=True,
|
318 |
)
|
319 |
ref_dropdown = gr.Dropdown(
|
|
|
18 |
from TTS.tts.configs.xtts_config import XttsConfig
|
19 |
from TTS.tts.models.xtts import Xtts
|
20 |
from vinorm import TTSnorm
|
21 |
+
from postprocess import Postprocessor
|
22 |
+
|
23 |
+
output_dir = 'outputs'
|
24 |
+
os.makedirs(output_dir, exist_ok=True)
|
25 |
|
26 |
# download for mecab
|
27 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
43 |
repo_id=repo_id,
|
44 |
repo_type="model",
|
45 |
local_dir=checkpoint_dir,
|
46 |
+
allow_patterns=["model.pth", "*.json"]
|
47 |
)
|
48 |
|
49 |
xtts_config = os.path.join(checkpoint_dir, "config.json")
|
|
|
57 |
if torch.cuda.is_available():
|
58 |
MODEL.cuda()
|
59 |
|
60 |
+
postprocessor = Postprocessor()
|
61 |
+
|
62 |
supported_languages = config.languages
|
63 |
if not "vi" in supported_languages:
|
64 |
supported_languages.append("vi")
|
|
|
78 |
.replace("A.I", "Ây Ai")
|
79 |
.replace("ad", "át")
|
80 |
.replace("marketing", "ma két tin")
|
81 |
+
.replace("tienziven", "tin di vần")
|
82 |
)
|
83 |
return text
|
84 |
|
|
|
178 |
keep_len = calculate_keep_len(prompt, language)
|
179 |
out["wav"] = out["wav"][:keep_len]
|
180 |
|
181 |
+
torchaudio.save("outputs/xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
182 |
+
|
183 |
+
postprocessor.convert_tone_color(
|
184 |
+
reference_speaker=speaker_wav,
|
185 |
+
src_path="outputs/xtts.wav",
|
186 |
+
save_path="outputs/openvoice.wav",
|
187 |
+
base_speaker=speaker_wav,
|
188 |
+
)
|
189 |
+
return "outputs/openvoice.wav"
|
190 |
|
191 |
except RuntimeError as e:
|
192 |
if "device-side assert" in str(e):
|
|
|
328 |
)
|
329 |
normalize_text = gr.Checkbox(
|
330 |
label="Chuẩn hóa văn bản tiếng Việt",
|
331 |
+
info="Chuẩn hóa văn bản tiếng Việt (vd: marketing -> ma két ting)",
|
332 |
value=True,
|
333 |
)
|
334 |
ref_dropdown = gr.Dropdown(
|
checkpoints_v2/base_speakers/ses/en-au.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
|
3 |
+
size 1701
|
checkpoints_v2/base_speakers/ses/en-br.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
|
3 |
+
size 1701
|
checkpoints_v2/base_speakers/ses/en-default.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
|
3 |
+
size 1783
|
checkpoints_v2/base_speakers/ses/en-india.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
|
3 |
+
size 1701
|
checkpoints_v2/base_speakers/ses/en-newest.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
|
3 |
+
size 1692
|
checkpoints_v2/base_speakers/ses/en-us.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
|
3 |
+
size 1701
|
checkpoints_v2/base_speakers/ses/es.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
|
3 |
+
size 1692
|
checkpoints_v2/base_speakers/ses/fr.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
|
3 |
+
size 1692
|
checkpoints_v2/base_speakers/ses/jp.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
|
3 |
+
size 1692
|
checkpoints_v2/base_speakers/ses/kr.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
|
3 |
+
size 1692
|
checkpoints_v2/base_speakers/ses/zh.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
|
3 |
+
size 1692
|
checkpoints_v2/converter/checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
|
3 |
+
size 131320490
|
checkpoints_v2/converter/config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_version_": "v2",
|
3 |
+
"data": {
|
4 |
+
"sampling_rate": 22050,
|
5 |
+
"filter_length": 1024,
|
6 |
+
"hop_length": 256,
|
7 |
+
"win_length": 1024,
|
8 |
+
"n_speakers": 0
|
9 |
+
},
|
10 |
+
"model": {
|
11 |
+
"zero_g": true,
|
12 |
+
"inter_channels": 192,
|
13 |
+
"hidden_channels": 192,
|
14 |
+
"filter_channels": 768,
|
15 |
+
"n_heads": 2,
|
16 |
+
"n_layers": 6,
|
17 |
+
"kernel_size": 3,
|
18 |
+
"p_dropout": 0.1,
|
19 |
+
"resblock": "1",
|
20 |
+
"resblock_kernel_sizes": [
|
21 |
+
3,
|
22 |
+
7,
|
23 |
+
11
|
24 |
+
],
|
25 |
+
"resblock_dilation_sizes": [
|
26 |
+
[
|
27 |
+
1,
|
28 |
+
3,
|
29 |
+
5
|
30 |
+
],
|
31 |
+
[
|
32 |
+
1,
|
33 |
+
3,
|
34 |
+
5
|
35 |
+
],
|
36 |
+
[
|
37 |
+
1,
|
38 |
+
3,
|
39 |
+
5
|
40 |
+
]
|
41 |
+
],
|
42 |
+
"upsample_rates": [
|
43 |
+
8,
|
44 |
+
8,
|
45 |
+
2,
|
46 |
+
2
|
47 |
+
],
|
48 |
+
"upsample_initial_channel": 512,
|
49 |
+
"upsample_kernel_sizes": [
|
50 |
+
16,
|
51 |
+
16,
|
52 |
+
4,
|
53 |
+
4
|
54 |
+
],
|
55 |
+
"gin_channels": 256
|
56 |
+
}
|
57 |
+
}
|
output.wav
DELETED
Binary file (614 kB)
|
|
postprocess.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
from openvoice import se_extractor
|
4 |
+
from openvoice.api import ToneColorConverter
|
5 |
+
import requests
|
6 |
+
import zipfile
|
7 |
+
import shutil
|
8 |
+
# Torch device string shared by this module: first CUDA device when available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Directory (relative to the working directory) where generated audio is written.
output_dir = 'outputs'

# NOTE: the OpenVoice v2 checkpoints were originally fetched manually with:
#   wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip
#   unzip checkpoints_v2_0417.zip

# Pre-fetch the silero-vad sources into a torch-hub-style cache folder under the
# current working directory, so they exist locally before they are needed.
# NOTE(review): presumably consumed by the VAD step of se_extractor.get_se via
# torch.hub -- confirm that the hub directory actually resolves to this path.
ROOT = os.getcwd()
cache_dir = f"{ROOT}/.cache/torch/hub"
extract_dir = os.path.join(cache_dir, "snakers4_silero-vad_master")
url = "https://github.com/snakers4/silero-vad/zipball/master"

if not os.path.exists(extract_dir):
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, "master.zip")
    temp_extract_dir = os.path.join(cache_dir, "temp_extract")
    # An interrupted earlier run may have left a partial extraction behind;
    # start clean so the first-entry lookup below cannot pick up stale content.
    shutil.rmtree(temp_extract_dir, ignore_errors=True)
    try:
        # The timeout keeps a stalled connection from blocking import forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(zip_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_extract_dir)

        # The archive is expected to contain a single top-level folder with a
        # generated name; move it to the fixed name checked above.
        extracted_folder = os.path.join(temp_extract_dir, os.listdir(temp_extract_dir)[0])
        shutil.move(extracted_folder, extract_dir)
    finally:
        # Remove the scratch directory and the downloaded archive whether or
        # not the download/extraction succeeded; errors still propagate.
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
        if os.path.exists(zip_path):
            os.remove(zip_path)
|
35 |
+
|
36 |
+
class Postprocessor:
    """Run OpenVoice tone-color conversion on a synthesized audio file.

    The converter is built once from the files under ``checkpoints_v2/converter``
    and reused for every call to :meth:`convert_tone_color`.
    """

    def __init__(self):
        """Load the tone-color converter and make sure the output folder exists."""
        # Reuse the module-level device so this attribute and the converter
        # always refer to the same device string.
        self.device = device
        self.tone_color_converter = ToneColorConverter(
            'checkpoints_v2/converter/config.json', device=self.device
        )
        self.tone_color_converter.load_ckpt('checkpoints_v2/converter/checkpoint.pth')
        os.makedirs(output_dir, exist_ok=True)

    def convert_tone_color(
        self,
        reference_speaker: str,
        src_path: str = "outputs/xtts.wav",
        save_path: str = "outputs/openvoice.wav",
        base_speaker: str = "outputs/xtts.wav",
    ) -> None:
        """Convert the audio at ``src_path`` and write the result to ``save_path``.

        Args:
            reference_speaker: Path to the audio the target embedding is
                extracted from.
            src_path: Path to the audio file to convert.
            save_path: Path the converted audio is written to.
            base_speaker: Path to the audio the source embedding is
                extracted from.

        NOTE(review): if ``base_speaker`` and ``reference_speaker`` are the
        same file, both embeddings come from identical input -- confirm that
        this is what callers intend.
        """
        # get_se returns a pair; only the first element (the embedding) is used.
        source_se, _ = se_extractor.get_se(base_speaker, self.tone_color_converter, vad=True)
        target_se, _ = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=True)

        # Presumably the payload embedded by the converter as a watermark --
        # TODO confirm against the OpenVoice documentation.
        encode_message = "@MyShell"

        # A caller-supplied save_path may point outside output_dir; make sure
        # its directory exists before the converter writes to it.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

        self.tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
            message=encode_message,
        )
|
55 |
+
|
56 |
+
if __name__ == "__main__":
    # Manual smoke run: convert the default XTTS output, using that same file
    # as the reference speaker, then report completion.
    Postprocessor().convert_tone_color(reference_speaker="outputs/xtts.wav")
    print("Done!")
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
|
|
|
|
|
2 |
typing-extensions>=4.8.0
|
3 |
cutlet
|
4 |
mecab-python3==1.0.6
|
@@ -11,6 +13,10 @@ gradio==4.36.1
|
|
11 |
spaces
|
12 |
huggingface_hub
|
13 |
python-docx
|
|
|
|
|
|
|
|
|
14 |
# Vietnamese 101
|
15 |
vinorm==2.0.7
|
16 |
underthesea==6.8.0
|
|
|
1 |
+
git+https://github.com/myshell-ai/OpenVoice.git
|
2 |
+
git+https://github.com/myshell-ai/MeloTTS.git
|
3 |
+
TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
|
4 |
typing-extensions>=4.8.0
|
5 |
cutlet
|
6 |
mecab-python3==1.0.6
|
|
|
13 |
spaces
|
14 |
huggingface_hub
|
15 |
python-docx
|
16 |
+
|
17 |
+
faster-whisper
|
18 |
+
whisper-timestamped
|
19 |
+
wavmark
|
20 |
# Vietnamese 101
|
21 |
vinorm==2.0.7
|
22 |
underthesea==6.8.0
|