toandev committed on
Commit
13a0cd9
·
0 Parent(s):
Files changed (6) hide show
  1. .gitignore +2 -0
  2. README.md +12 -0
  3. app.py +102 -0
  4. examples/01.wav +0 -0
  5. examples/02.wav +0 -0
  6. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .gradio
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: F5-TTS-Vietnamese
3
+ emoji: 🗣️
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.13.1
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from cached_path import cached_path
4
+ import tempfile
5
+
6
+ from f5_tts.model import DiT
7
+ from f5_tts.infer.utils_infer import (
8
+ preprocess_ref_audio_text,
9
+ load_vocoder,
10
+ load_model,
11
+ infer_process,
12
+ save_spectrogram,
13
+ )
14
+
15
+
16
# Remote artifacts for the Vietnamese checkpoint; each URI is resolved to a
# local path through cached_path before being handed to load_model.
_CHECKPOINT_URI = "hf://toandev/F5-TTS-Vietnamese/model_latest.safetensors"
_VOCAB_URI = "hf://toandev/F5-TTS-Vietnamese/vocab.txt"

# Configuration mapping passed to load_model together with the DiT class.
_DIT_CONFIG = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}

# Both objects are created once at import time and reused by every request.
vocoder = load_vocoder()
model = load_model(
    DiT,
    _DIT_CONFIG,
    ckpt_path=str(cached_path(_CHECKPOINT_URI)),
    vocab_file=str(cached_path(_VOCAB_URI)),
)
25
+
26
+
27
@spaces.GPU
def infer(ref_audio_orig: str, gen_text: str, speed: float = 1.0):
    """Synthesize ``gen_text`` using ``ref_audio_orig`` as the reference clip.

    Args:
        ref_audio_orig: Reference audio handed to ``preprocess_ref_audio_text``
            (the UI supplies it as a file path). ``None`` is rejected.
        gen_text: Text to synthesize. ``None`` or whitespace-only is rejected.
        speed: Speed value forwarded unchanged to ``infer_process``.

    Returns:
        A pair ``((sample_rate, waveform), spectrogram_path)`` where
        ``spectrogram_path`` is the path of a temporary ``.png`` file written
        by ``save_spectrogram``.

    Raises:
        gr.Error: If an input is missing, or if any step of preprocessing,
            inference, or spectrogram rendering fails. The underlying
            exception is attached as ``__cause__``.
    """
    if ref_audio_orig is None:
        raise gr.Error("Reference audio is required.")

    if gen_text is None or not gen_text.strip():
        raise gr.Error("Text to generate is required.")

    try:
        # An empty reference text is passed on purpose.
        # NOTE(review): presumably the helper then derives the transcript
        # itself -- confirm against f5_tts.infer.utils_infer.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text,
            model,
            vocoder,
            cross_fade_duration=0.15,
            nfe_step=32,
            speed=speed,
        )

        # delete=False: the file has to outlive this call because only its
        # path is returned to the caller.
        with tempfile.NamedTemporaryFile(
            suffix=".png", delete=False
        ) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name
        # Write only after the handle is closed so this also works on
        # platforms that refuse to reopen a file that is still open.
        save_spectrogram(combined_spectrogram, spectrogram_path)
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        # Chain the cause so the original traceback stays available in logs.
        raise gr.Error(f"An error occurred during inference: {e}") from e

    return (final_sample_rate, final_wave), spectrogram_path
57
+
58
+
59
# Input widgets, created in the order their values are passed to infer():
# reference audio, text to generate, speed.
_reference_audio = gr.components.Audio(type="filepath", label="Reference Audio")
_text_to_generate = gr.components.Textbox(label="Text to Generate", lines=3)
_speed_slider = gr.components.Slider(
    label="Speed",
    minimum=0.3,
    maximum=2.0,
    value=1.0,
    step=0.1,
    info="Adjust the speed of the audio.",
)

# Output widgets, matching the two values returned by infer().
_synthesized_audio = gr.components.Audio(type="numpy", label="Synthesized Audio")
_spectrogram_image = gr.components.Image(type="filepath", label="Spectrogram")

# Each example row is [reference audio path, text to generate, speed].
_examples = [
    [
        "examples/01.wav",
        "Kiểm soát cảm xúc thực chất là một quá trình đánh giá lại bản thân, để tìm thấy tự do, thoát khỏi sự cuốn hút của chính bản ngã.",
        0.8,
    ],
    [
        "examples/02.wav",
        "Ngoài ra, nội dung ở bên kênh đấy tôi sẽ cố gắng là không nói bậy nhá.",
        1.0,
    ],
    [
        "examples/01.wav",
        "Cho tôi năm trăm triệu tôi sẽ gạch tên Pew và con tôi ra khỏi danh sách bạn bè, thực tế còn chịu tham gia một trận bốc xing để kết thúc tình nghĩa.",
        0.8,
    ],
]

# Single-function UI wired to infer(); the clear button is disabled and
# flagging is turned off.
iface = gr.Interface(
    title="F5-TTS Vietnamese",
    description="Based on the [F5-TTS](https://github.com/SWivid/F5-TTS) model, a Diffusion Transformer with ConvNeXt V2, this Vietnamese text-to-speech model was trained on ~4 hours of Vietnamese audio data in 41k training steps. It boasts faster training and inference speeds, however, the quality of the synthesized speech may have noticeable imperfections such as choppiness or lack of natural intonation.",
    fn=infer,
    inputs=[_reference_audio, _text_to_generate, _speed_slider],
    outputs=[_synthesized_audio, _spectrogram_image],
    submit_btn="Synthesize",
    clear_btn=None,
    flagging_mode="never",
    examples=_examples,
)

if __name__ == "__main__":
    # Turn on the request queue, then launch the app.
    queued_app = iface.queue()
    queued_app.launch()
examples/01.wav ADDED
Binary file (209 kB). View file
 
examples/02.wav ADDED
Binary file (365 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ soundfile
4
+ transformers
5
+ f5_tts @ git+https://github.com/SWivid/F5-TTS.git
6
+ bitsandbytes>0.37.0