Spaces:

AnKhanh
/

Vietnamese-text-to-speech-tienziven

Paused

@@ -18,6 +18,10 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 # download for mecab
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -39,6 +43,7 @@ if not all(file in files_in_dir for file in required_files):
         repo_id=repo_id,
         repo_type="model",
         local_dir=checkpoint_dir,
     )
 xtts_config = os.path.join(checkpoint_dir, "config.json")
@@ -52,6 +57,8 @@ MODEL.load_checkpoint(
 if torch.cuda.is_available():
     MODEL.cuda()
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
@@ -71,7 +78,7 @@ def normalize_vietnamese_text(text):
         .replace("A.I", "Ây Ai")
         .replace("ad", "át")
         .replace("marketing", "ma két tin")
-        .replace("tienziven", "tien di vần")
     )
     return text
@@ -171,7 +178,15 @@ def predict(
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
-        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
@@ -313,7 +328,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             )
             normalize_text = gr.Checkbox(
                 label="Chuẩn hóa văn bản tiếng Việt",
-                info="Normalize Vietnamese text",
                 value=True,
             )
             ref_dropdown = gr.Dropdown(

 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
+from postprocess import Postprocessor
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
 # download for mecab
 HF_TOKEN = os.environ.get("HF_TOKEN")
         repo_id=repo_id,
         repo_type="model",
         local_dir=checkpoint_dir,
+        allow_patterns=["model.pth", "*.json"]
     )
 xtts_config = os.path.join(checkpoint_dir, "config.json")
 if torch.cuda.is_available():
     MODEL.cuda()
+postprocessor = Postprocessor()
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
         .replace("A.I", "Ây Ai")
         .replace("ad", "át")
         .replace("marketing", "ma két tin")
+        .replace("tienziven", "tin di vần")
     )
     return text
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
+        torchaudio.save("outputs/xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+        postprocessor.convert_tone_color(
+            reference_speaker=speaker_wav,
+            src_path="outputs/xtts.wav",
+            save_path="outputs/openvoice.wav",
+            base_speaker=speaker_wav,
+        )
+        return "outputs/openvoice.wav"
     except RuntimeError as e:
         if "device-side assert" in str(e):
             )
             normalize_text = gr.Checkbox(
                 label="Chuẩn hóa văn bản tiếng Việt",
+                info="Chuẩn hóa văn bản tiếng Việt (vd: marketing -> ma két ting)",
                 value=True,
             )
             ref_dropdown = gr.Dropdown(

checkpoints_v2/base_speakers/ses/en-au.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
+size 1701

checkpoints_v2/base_speakers/ses/en-br.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
+size 1701

checkpoints_v2/base_speakers/ses/en-default.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
+size 1783

checkpoints_v2/base_speakers/ses/en-india.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
+size 1701

checkpoints_v2/base_speakers/ses/en-newest.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
+size 1692

checkpoints_v2/base_speakers/ses/en-us.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
+size 1701

checkpoints_v2/base_speakers/ses/es.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
+size 1692

checkpoints_v2/base_speakers/ses/fr.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
+size 1692

checkpoints_v2/base_speakers/ses/jp.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
+size 1692

checkpoints_v2/base_speakers/ses/kr.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
+size 1692

checkpoints_v2/base_speakers/ses/zh.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
+size 1692

checkpoints_v2/converter/checkpoint.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
+size 131320490

checkpoints_v2/converter/config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_version_": "v2",
+  "data": {
+    "sampling_rate": 22050,
+    "filter_length": 1024,
+    "hop_length": 256,
+    "win_length": 1024,
+    "n_speakers": 0
+  },
+  "model": {
+    "zero_g": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4
+    ],
+    "gin_channels": 256
+  }
+}

output.wav DELETED Viewed

Binary file (614 kB)

postprocess.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import os
+import torch
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+import requests
+import zipfile
+import shutil
+device="cuda:0" if torch.cuda.is_available() else "cpu"
+output_dir = 'outputs'
+# os.system("wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip")
+# os.system("unzip checkpoints_v2_0417.zip")
+ROOT = os.getcwd()
+cache_dir = f"{ROOT}/.cache/torch/hub"
+extract_dir = os.path.join(cache_dir, "snakers4_silero-vad_master")
+url = "https://github.com/snakers4/silero-vad/zipball/master"
+if not os.path.exists(extract_dir):
+    os.makedirs(cache_dir, exist_ok=True)
+    zip_path = os.path.join(cache_dir, "master.zip")
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()
+        with open(zip_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    temp_extract_dir = os.path.join(cache_dir, "temp_extract")
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(temp_extract_dir)
+    # Find the extracted folder and rename it
+    extracted_folder = os.path.join(temp_extract_dir, os.listdir(temp_extract_dir)[0])
+    shutil.move(extracted_folder, extract_dir)
+    shutil.rmtree(temp_extract_dir)
+class Postprocessor:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tone_color_converter = ToneColorConverter('checkpoints_v2/converter/config.json', device=device)
+        self.tone_color_converter.load_ckpt('checkpoints_v2/converter/checkpoint.pth')
+        os.makedirs(output_dir, exist_ok=True)
+    def convert_tone_color(self, reference_speaker, src_path="outputs/xtts.wav", save_path="outputs/openvoice.wav", base_speaker="outputs/xtts.wav"):
+        source_se, audio_name = se_extractor.get_se(base_speaker, self.tone_color_converter, vad=True)
+        target_se, audio_name = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=True)
+        encode_message = "@MyShell"
+        self.tone_color_converter.convert(
+            audio_src_path=src_path,
+            src_se=source_se,
+            tgt_se=target_se,
+            output_path=save_path,
+            message=encode_message)
+if __name__ == "__main__":
+    postprocessor = Postprocessor()
+    postprocessor.convert_tone_color(reference_speaker="outputs/xtts.wav")
+    print("Done!")

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
-# TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
 typing-extensions>=4.8.0
 cutlet
 mecab-python3==1.0.6
@@ -11,6 +13,10 @@ gradio==4.36.1
 spaces
 huggingface_hub
 python-docx
 # Vietnamese 101
 vinorm==2.0.7
 underthesea==6.8.0

+git+https://github.com/myshell-ai/OpenVoice.git
+git+https://github.com/myshell-ai/MeloTTS.git
+TTS @ git+https://github.com/thinhlpg/TTS.git@ff217b3f27b294de194cc59c5119d1e08b06413c
 typing-extensions>=4.8.0
 cutlet
 mecab-python3==1.0.6
 spaces
 huggingface_hub
 python-docx
+faster-whisper
+whisper-timestamped
+wavmark
 # Vietnamese 101
 vinorm==2.0.7
 underthesea==6.8.0