Nguyen Ha Lan committed on
Commit
d1199f2
·
1 Parent(s): 2498595

add postprocess voice

Browse files
.gitignore CHANGED
@@ -1 +1,4 @@
1
- model
 
 
 
 
1
+ model
2
+ outputs
3
+ processed
4
+ checkpoints_v2_0417.zip
__pycache__/openvoice.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
__pycache__/postprocess.cpython-310.pyc ADDED
Binary file (2.35 kB). View file
 
app.py CHANGED
@@ -18,6 +18,10 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
18
  from TTS.tts.configs.xtts_config import XttsConfig
19
  from TTS.tts.models.xtts import Xtts
20
  from vinorm import TTSnorm
 
 
 
 
21
 
22
  # download for mecab
23
  HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -39,6 +43,7 @@ if not all(file in files_in_dir for file in required_files):
39
  repo_id=repo_id,
40
  repo_type="model",
41
  local_dir=checkpoint_dir,
 
42
  )
43
 
44
  xtts_config = os.path.join(checkpoint_dir, "config.json")
@@ -52,6 +57,8 @@ MODEL.load_checkpoint(
52
  if torch.cuda.is_available():
53
  MODEL.cuda()
54
 
 
 
55
  supported_languages = config.languages
56
  if not "vi" in supported_languages:
57
  supported_languages.append("vi")
@@ -71,7 +78,7 @@ def normalize_vietnamese_text(text):
71
  .replace("A.I", "Ây Ai")
72
  .replace("ad", "át")
73
  .replace("marketing", "ma két tin")
74
- .replace("tienziven", "tien di vần")
75
  )
76
  return text
77
 
@@ -171,7 +178,15 @@ def predict(
171
  keep_len = calculate_keep_len(prompt, language)
172
  out["wav"] = out["wav"][:keep_len]
173
 
174
- torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 
 
 
 
 
 
 
 
175
 
176
  except RuntimeError as e:
177
  if "device-side assert" in str(e):
@@ -313,7 +328,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
313
  )
314
  normalize_text = gr.Checkbox(
315
  label="Chuẩn hóa văn bản tiếng Việt",
316
- info="Normalize Vietnamese text",
317
  value=True,
318
  )
319
  ref_dropdown = gr.Dropdown(
 
18
  from TTS.tts.configs.xtts_config import XttsConfig
19
  from TTS.tts.models.xtts import Xtts
20
  from vinorm import TTSnorm
21
+ from postprocess import Postprocessor
22
+
23
+ output_dir = 'outputs'
24
+ os.makedirs(output_dir, exist_ok=True)
25
 
26
  # download for mecab
27
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
43
  repo_id=repo_id,
44
  repo_type="model",
45
  local_dir=checkpoint_dir,
46
+ allow_patterns=["model.pth", "*.json"]
47
  )
48
 
49
  xtts_config = os.path.join(checkpoint_dir, "config.json")
 
57
  if torch.cuda.is_available():
58
  MODEL.cuda()
59
 
60
+ postprocessor = Postprocessor()
61
+
62
  supported_languages = config.languages
63
  if not "vi" in supported_languages:
64
  supported_languages.append("vi")
 
78
  .replace("A.I", "Ây Ai")
79
  .replace("ad", "át")
80
  .replace("marketing", "ma két tin")
81
+ .replace("tienziven", "tin di vần")
82
  )
83
  return text
84
 
 
178
  keep_len = calculate_keep_len(prompt, language)
179
  out["wav"] = out["wav"][:keep_len]
180
 
181
+ torchaudio.save("outputs/xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
182
+
183
+ postprocessor.convert_tone_color(
184
+ reference_speaker=speaker_wav,
185
+ src_path="outputs/xtts.wav",
186
+ save_path="outputs/openvoice.wav",
187
+ base_speaker=speaker_wav,
188
+ )
189
+ return "outputs/openvoice.wav"
190
 
191
  except RuntimeError as e:
192
  if "device-side assert" in str(e):
 
328
  )
329
  normalize_text = gr.Checkbox(
330
  label="Chuẩn hóa văn bản tiếng Việt",
331
+ info="Chuẩn hóa văn bản tiếng Việt (vd: marketing -> ma két ting)",
332
  value=True,
333
  )
334
  ref_dropdown = gr.Dropdown(
checkpoints_v2/base_speakers/ses/en-au.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-br.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-default.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
3
+ size 1783
checkpoints_v2/base_speakers/ses/en-india.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-newest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
3
+ size 1692
checkpoints_v2/base_speakers/ses/en-us.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
3
+ size 1701
checkpoints_v2/base_speakers/ses/es.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
3
+ size 1692
checkpoints_v2/base_speakers/ses/fr.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
3
+ size 1692
checkpoints_v2/base_speakers/ses/jp.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
3
+ size 1692
checkpoints_v2/base_speakers/ses/kr.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
3
+ size 1692
checkpoints_v2/base_speakers/ses/zh.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
3
+ size 1692
checkpoints_v2/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
3
+ size 131320490
checkpoints_v2/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_version_": "v2",
3
+ "data": {
4
+ "sampling_rate": 22050,
5
+ "filter_length": 1024,
6
+ "hop_length": 256,
7
+ "win_length": 1024,
8
+ "n_speakers": 0
9
+ },
10
+ "model": {
11
+ "zero_g": true,
12
+ "inter_channels": 192,
13
+ "hidden_channels": 192,
14
+ "filter_channels": 768,
15
+ "n_heads": 2,
16
+ "n_layers": 6,
17
+ "kernel_size": 3,
18
+ "p_dropout": 0.1,
19
+ "resblock": "1",
20
+ "resblock_kernel_sizes": [
21
+ 3,
22
+ 7,
23
+ 11
24
+ ],
25
+ "resblock_dilation_sizes": [
26
+ [
27
+ 1,
28
+ 3,
29
+ 5
30
+ ],
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ]
41
+ ],
42
+ "upsample_rates": [
43
+ 8,
44
+ 8,
45
+ 2,
46
+ 2
47
+ ],
48
+ "upsample_initial_channel": 512,
49
+ "upsample_kernel_sizes": [
50
+ 16,
51
+ 16,
52
+ 4,
53
+ 4
54
+ ],
55
+ "gin_channels": 256
56
+ }
57
+ }
output.wav DELETED
Binary file (614 kB)
 
postprocess.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from openvoice import se_extractor
4
+ from openvoice.api import ToneColorConverter
5
+ import requests
6
+ import zipfile
7
+ import shutil
8
# Torch device string used by this module: first CUDA device when available,
# otherwise CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Directory where converted audio files are written.
output_dir = 'outputs'

# NOTE: the OpenVoice v2 checkpoints are expected under ./checkpoints_v2
# (they were originally fetched from
# https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip).

ROOT = os.getcwd()
# Cache layout below mirrors a torch.hub checkout name; presumably this lets
# torch.hub find silero-vad without fetching it itself -- TODO confirm that
# TORCH_HOME / the hub dir actually resolves to this location at runtime.
cache_dir = f"{ROOT}/.cache/torch/hub"
extract_dir = os.path.join(cache_dir, "snakers4_silero-vad_master")
url = "https://github.com/snakers4/silero-vad/zipball/master"


def _download_and_extract(archive_url, hub_cache_dir, target_dir, timeout=60):
    """Download a zip archive and move its single top-level folder to target_dir.

    Args:
        archive_url: URL of the zip archive to download.
        hub_cache_dir: Directory used for the temporary zip and extraction.
        target_dir: Final location of the extracted top-level folder.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block forever on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the archive does not contain exactly one top-level
            entry, so the folder to move cannot be determined safely.
    """
    os.makedirs(hub_cache_dir, exist_ok=True)
    zip_path = os.path.join(hub_cache_dir, "master.zip")
    temp_extract_dir = os.path.join(hub_cache_dir, "temp_extract")

    # A previous failed run may have left a partial extraction behind; start
    # clean so the listing below only sees this run's content.
    shutil.rmtree(temp_extract_dir, ignore_errors=True)
    try:
        with requests.get(archive_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(zip_path, 'wb') as zip_file:
                for chunk in response.iter_content(chunk_size=8192):
                    zip_file.write(chunk)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_extract_dir)

        # Find the extracted folder and rename it
        entries = os.listdir(temp_extract_dir)
        if len(entries) != 1:
            raise RuntimeError(
                f"Expected one top-level folder in {archive_url}, found {len(entries)}"
            )
        shutil.move(os.path.join(temp_extract_dir, entries[0]), target_dir)
    finally:
        # Never leave the temporary extraction dir or the zip behind,
        # whether the download succeeded or not.
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
        if os.path.exists(zip_path):
            os.remove(zip_path)


if not os.path.exists(extract_dir):
    _download_and_extract(url, cache_dir, extract_dir)
35
+
36
class Postprocessor:
    """Post-process synthesized speech with OpenVoice's ToneColorConverter.

    Loads the converter config and checkpoint from ``checkpoints_v2/converter``
    once at construction time and reuses them for every conversion.
    """

    def __init__(self):
        # Keep a single source of truth for the device: the same string is
        # stored on the instance and handed to the converter.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.tone_color_converter = ToneColorConverter(
            'checkpoints_v2/converter/config.json', device=self.device
        )
        self.tone_color_converter.load_ckpt('checkpoints_v2/converter/checkpoint.pth')
        os.makedirs(output_dir, exist_ok=True)

    def convert_tone_color(self, reference_speaker, src_path="outputs/xtts.wav", save_path="outputs/openvoice.wav", base_speaker="outputs/xtts.wav"):
        """Convert the audio at ``src_path`` toward the reference speaker's tone color.

        Args:
            reference_speaker: Path to the audio whose speaker embedding is
                used as the conversion target.
            src_path: Path to the audio file to convert.
            save_path: Path where the converted audio is written.
            base_speaker: Path to the audio whose speaker embedding is used
                as the conversion source.

        Returns:
            ``save_path``, so callers can forward the output location directly.
        """
        # get_se returns (embedding, audio_name); only the embedding is needed.
        source_se, _ = se_extractor.get_se(base_speaker, self.tone_color_converter, vad=True)
        target_se, _ = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=True)

        # Passed through as the converter's `message` argument
        # (presumably the watermark payload -- confirm against OpenVoice docs).
        encode_message = "@MyShell"

        # Make sure the destination directory exists for custom save paths.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        self.tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
            message=encode_message)
        return save_path
55
+
56
def _run_standalone_conversion():
    """Run one conversion with the default paths as a manual smoke check.

    Uses ``outputs/xtts.wav`` as the reference speaker; the source, base
    speaker and save paths fall back to ``convert_tone_color``'s defaults.
    """
    converter = Postprocessor()
    converter.convert_tone_color(reference_speaker="outputs/xtts.wav")
    print("Done!")


if __name__ == "__main__":
    _run_standalone_conversion()
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
- # TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
 
 
2
  typing-extensions>=4.8.0
3
  cutlet
4
  mecab-python3==1.0.6
@@ -11,6 +13,10 @@ gradio==4.36.1
11
  spaces
12
  huggingface_hub
13
  python-docx
 
 
 
 
14
  # Vietnamese 101
15
  vinorm==2.0.7
16
  underthesea==6.8.0
 
1
+ git+https://github.com/myshell-ai/OpenVoice.git
2
+ git+https://github.com/myshell-ai/MeloTTS.git
3
+ TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
4
  typing-extensions>=4.8.0
5
  cutlet
6
  mecab-python3==1.0.6
 
13
  spaces
14
  huggingface_hub
15
  python-docx
16
+
17
+ faster-whisper
18
+ whisper-timestamped
19
+ wavmark
20
  # Vietnamese 101
21
  vinorm==2.0.7
22
  underthesea==6.8.0