add stop duration option
Browse files- app.py +93 -8
- requirements.txt +2 -1
- vietTTS/hifigan/mel2wave.py +0 -4
- vietTTS/nat/text2mel.py +0 -6
app.py
CHANGED
@@ -6,33 +6,118 @@ import gradio as gr
|
|
6 |
import os
|
7 |
|
8 |
|
9 |
-
def text_to_speech(text,
|
10 |
print("starting")
|
11 |
# prevent too long text
|
12 |
if len(text) > 500:
|
13 |
text = text[:500]
|
14 |
# stop_duration_float = float(stop_duration_text)
|
|
|
15 |
text = nat_normalize_text(text)
|
16 |
mel = text2mel(
|
17 |
text,
|
18 |
"lexicon.txt",
|
19 |
-
|
20 |
"acoustic_latest_ckpt.pickle",
|
21 |
"duration_latest_ckpt.pickle",
|
22 |
)
|
23 |
-
print("mel")
|
24 |
-
print(mel)
|
25 |
wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
|
26 |
-
print("wave")
|
27 |
-
print(wave)
|
28 |
return (wave * (2**15)).astype(np.int16)
|
29 |
|
30 |
|
31 |
-
def speak(text,
|
32 |
-
y = text_to_speech(text,
|
33 |
return 16_000, y
|
34 |
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
title = "SLT TTS"
|
37 |
description = "SLT Vietnamese Text to speech demo."
|
38 |
|
|
|
6 |
import os
|
7 |
|
8 |
|
9 |
+
def text_to_speech(text,stop_duration):
    """Turn ``text`` into a waveform of 16-bit integer samples.

    The text is clipped to 500 characters, passed through ``clean_text`` and
    ``nat_normalize_text``, converted to a mel spectrogram by ``text2mel`` and
    finally to audio by ``mel2wave``.

    Args:
        text: Input text to synthesize.
        stop_duration: Forwarded unchanged as the third positional argument
            of ``text2mel``. NOTE(review): presumably the pause length used
            at stops, as the commit title suggests; confirm against
            ``text2mel``.

    Returns:
        The output of ``mel2wave`` multiplied by 2**15 and cast to
        ``np.int16``.
    """
    print("starting")

    # Guard against overly long input; slicing leaves shorter text unchanged.
    clipped = text[:500]

    spoken_form = clean_text(clipped)
    normalized = nat_normalize_text(spoken_form)

    mel = text2mel(
        normalized,
        "lexicon.txt",
        stop_duration,
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")

    scaled = wave * (2**15)
    return scaled.astype(np.int16)
|
26 |
|
27 |
|
28 |
+
def speak(text,stop_duration):
    """Synthesize ``text`` and pair the samples with the value 16000.

    Args:
        text: Input text, handed to ``text_to_speech``.
        stop_duration: Handed to ``text_to_speech`` unchanged.

    Returns:
        A ``(16_000, samples)`` tuple, where ``samples`` is whatever
        ``text_to_speech`` returns. NOTE(review): 16_000 looks like the
        sample rate in Hz expected by the UI audio output; confirm.
    """
    samples = text_to_speech(text, stop_duration)
    return 16_000, samples
|
31 |
|
32 |
|
33 |
+
|
34 |
+
def clean_text(test_string):
    """Spell out dates, clock times and bare numbers found in ``test_string``.

    The text is processed one whitespace-delimited token at a time, and the
    first rule that matches the *start* of a token wins:

    1. ``dd/mm/yyyy`` or ``dd-mm-yyyy`` -> ``Ngày <dd> tháng <mm> năm <yyyy>``
    2. ``dd/mm`` or ``dd-mm``           -> ``Ngày <dd> tháng <mm>``
    3. ``H:MM`` or ``HhMM``             -> ``<H> giờ <MM> phút ``
    4. a token made only of digits      -> ``n2w_single(token)``

    Numbers in rules 1-3 are converted with ``n2w``. Tokens that match no
    rule, and all whitespace between tokens, are returned unchanged.

    Each token is rewritten in place rather than through a global
    ``str.replace`` on the whole text, so:

    * a number is never rewritten inside an unrelated token (for example
      ``5`` inside ``25kg``);
    * characters following the matched part of a token (such as a trailing
      comma) are kept instead of being dropped with the token;
    * an all-zero month or minute field (``00``) is converted as ``0``
      instead of being reduced to an empty string before conversion;
    * exceptions raised by ``n2w`` / ``n2w_single`` propagate instead of
      being mistaken for "no match".

    Args:
        test_string: Raw input text.

    Returns:
        The text with every recognised token replaced by its spoken form.
    """
    full_date = re.compile(
        r"(?P<day>\d{2})(?P<sep>[-/])(?P<month>\d{1,2})(?P=sep)(?P<year>\d{4})"
    )
    day_month = re.compile(r"(?P<day>\d{2})[-/](?P<month>\d{1,2})")
    clock = re.compile(r"(?P<hour>\d{1,2})[h:](?P<minute>\d{1,2})")

    def without_leading_zeros(digits):
        # "05" -> "5"; an all-zero field stays "0" so the converter never
        # receives an empty string.
        return digits.lstrip("0") or "0"

    def spell(token_match):
        token = token_match.group(0)

        hit = full_date.match(token)
        if hit is not None:
            spoken = (
                "Ngày " + n2w(hit.group("day"))
                + " tháng " + n2w(without_leading_zeros(hit.group("month")))
                + " năm " + n2w(hit.group("year"))
            )
            return spoken + token[hit.end():]

        hit = day_month.match(token)
        if hit is not None:
            spoken = (
                "Ngày " + n2w(hit.group("day"))
                + " tháng " + n2w(without_leading_zeros(hit.group("month")))
            )
            return spoken + token[hit.end():]

        hit = clock.match(token)
        if hit is not None:
            spoken = (
                n2w(hit.group("hour"))
                + " giờ " + n2w(without_leading_zeros(hit.group("minute")))
                + " phút "
            )
            return spoken + token[hit.end():]

        if token.isdigit():
            return n2w_single(token)

        return token

    # Substituting over r"\S+" visits every token exactly once and leaves the
    # original whitespace between tokens untouched.
    return re.sub(r"\S+", spell, test_string)
|
119 |
+
|
120 |
+
|
121 |
title = "SLT TTS"
|
122 |
description = "SLT Vietnamese Text to speech demo."
|
123 |
|
requirements.txt
CHANGED
@@ -10,4 +10,5 @@ tabulate
|
|
10 |
textgrid@ git+https://github.com/kylebgorman/textgrid.git
|
11 |
tqdm
|
12 |
matplotlib
|
13 |
-
|
|
|
|
10 |
textgrid@ git+https://github.com/kylebgorman/textgrid.git
|
11 |
tqdm
|
12 |
matplotlib
|
13 |
+
noisereduce
|
14 |
+
vietnam_number
|
vietTTS/hifigan/mel2wave.py
CHANGED
@@ -37,10 +37,6 @@ def mel2wave(
|
|
37 |
aux = {}
|
38 |
wav, aux = forward.apply(params, aux, rng, mel)
|
39 |
wav = jnp.squeeze(wav)
|
40 |
-
print("wav : ")
|
41 |
-
print(wav)
|
42 |
jax.config.update('jax_platform_name', 'cpu')
|
43 |
audio = jax.device_get(wav)
|
44 |
-
print("audio : ")
|
45 |
-
print(audio)
|
46 |
return audio
|
|
|
37 |
aux = {}
|
38 |
wav, aux = forward.apply(params, aux, rng, mel)
|
39 |
wav = jnp.squeeze(wav)
|
|
|
|
|
40 |
jax.config.update('jax_platform_name', 'cpu')
|
41 |
audio = jax.device_get(wav)
|
|
|
|
|
42 |
return audio
|
vietTTS/nat/text2mel.py
CHANGED
@@ -100,13 +100,7 @@ def text2mel(
|
|
100 |
durations = jnp.where(
|
101 |
np.array(tokens)[None, :] == FLAGS.word_end_index, 0.0, durations
|
102 |
)
|
103 |
-
print("acoustic_ckpt : ")
|
104 |
-
print(acoustic_ckpt)
|
105 |
-
print("duration_ckpt : ")
|
106 |
-
print(duration_ckpt)
|
107 |
mels = predict_mel(tokens, durations, acoustic_ckpt)
|
108 |
-
print("mels : ")
|
109 |
-
print(mels)
|
110 |
if tokens[-1] == FLAGS.sil_index:
|
111 |
end_silence = durations[0, -1].item()
|
112 |
silence_frame = int(end_silence * FLAGS.sample_rate / (FLAGS.n_fft // 4))
|
|
|
100 |
durations = jnp.where(
|
101 |
np.array(tokens)[None, :] == FLAGS.word_end_index, 0.0, durations
|
102 |
)
|
|
|
|
|
|
|
|
|
103 |
mels = predict_mel(tokens, durations, acoustic_ckpt)
|
|
|
|
|
104 |
if tokens[-1] == FLAGS.sil_index:
|
105 |
end_silence = durations[0, -1].item()
|
106 |
silence_frame = int(end_silence * FLAGS.sample_rate / (FLAGS.n_fft // 4))
|