tobiccino committed
Commit 725d577
1 Parent(s): 5386471
vietTTS/hifigan/mel2wave.py CHANGED
@@ -1,11 +1,9 @@
 import json
-import os
 import pickle
 
 import haiku as hk
 import jax
 import jax.numpy as jnp
-import numpy as np
 
 from .config import FLAGS
 from .model import Generator
@@ -17,9 +15,11 @@ class AttrDict(dict):
         self.__dict__ = self
 
 
-def mel2wave(mel):
-    config_file = "assets/hifigan/config.json"
-    MAX_WAV_VALUE = 32768.0
+def mel2wave(
+    mel,
+    config_file="assets/hifigan/config.json",
+    ckpt_file=FLAGS.ckpt_dir / "hk_hifi.pickle",
+):
     with open(config_file) as f:
         data = f.read()
         json_config = json.loads(data)
@@ -32,10 +32,10 @@ def mel2wave(mel):
 
     rng = next(hk.PRNGSequence(42))
 
-    with open(FLAGS.ckpt_dir / "hk_hifi.pickle", "rb") as f:
+    with open(ckpt_file, "rb") as f:
         params = pickle.load(f)
     aux = {}
     wav, aux = forward.apply(params, aux, rng, mel)
     wav = jnp.squeeze(wav)
     audio = jax.device_get(wav)
-    return audio
+    return audio
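The refactor turns mel2wave's hard-coded config and checkpoint locations into keyword arguments whose defaults reproduce the old values, and drops the unused os/numpy imports along with the dead MAX_WAV_VALUE constant. A minimal usage sketch of the new signature; the mel shape and the custom checkpoint path are illustrative assumptions, not part of the commit:

import jax.numpy as jnp

from vietTTS.hifigan.mel2wave import mel2wave

# Hypothetical input: batch of 1, 500 frames, 80 mel bands (shape is an assumption).
mel = jnp.zeros((1, 500, 80))

# The defaults reproduce the previous hard-coded behaviour.
audio = mel2wave(mel)

# A fine-tuned vocoder can now be selected per call instead of by editing the module.
audio = mel2wave(
    mel,
    config_file="assets/hifigan/config.json",
    ckpt_file="checkpoints/my_voice/hk_hifi.pickle",  # hypothetical path
)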
vietTTS/nat/text2mel.py CHANGED
@@ -19,12 +19,12 @@ def load_lexicon(fn):
     return dict(lines)
 
 
-def predict_duration(tokens):
+def predict_duration(tokens, ckpt_file):
     def fwd_(x):
         return DurationModel(is_training=False)(x)
 
     forward_fn = jax.jit(hk.transform_with_state(fwd_).apply)
-    with open(FLAGS.ckpt_dir / "duration_latest_ckpt.pickle", "rb") as f:
+    with open(ckpt_file, "rb") as f:
         dic = pickle.load(f)
     x = DurationInput(
         np.array(tokens, dtype=np.int32)[None, :],
@@ -58,8 +58,7 @@ def text2tokens(text, lexicon_fn):
     return tokens
 
 
-def predict_mel(tokens, durations):
-    ckpt_fn = FLAGS.ckpt_dir / "acoustic_latest_ckpt.pickle"
+def predict_mel(tokens, durations, ckpt_fn):
     with open(ckpt_fn, "rb") as f:
         dic = pickle.load(f)
     last_step, params, aux, rng, optim_state = (
@@ -83,10 +82,14 @@ def predict_mel(tokens, durations):
 
 
 def text2mel(
-    text: str, lexicon_fn=FLAGS.data_dir / "lexicon.txt", silence_duration: float = -1.0
+    text: str,
+    lexicon_fn=FLAGS.data_dir / "lexicon.txt",
+    silence_duration: float = -1.0,
+    acoustic_ckpt=FLAGS.ckpt_dir / "acoustic_latest_ckpt.pickle",
+    duration_ckpt=FLAGS.ckpt_dir / "duration_latest_ckpt.pickle",
 ):
     tokens = text2tokens(text, lexicon_fn)
-    durations = predict_duration(tokens)
+    durations = predict_duration(tokens, duration_ckpt)
     durations = jnp.where(
         np.array(tokens)[None, :] == FLAGS.sil_index,
         jnp.clip(durations, a_min=silence_duration, a_max=None),
@@ -95,7 +98,7 @@ def text2mel(
     durations = jnp.where(
         np.array(tokens)[None, :] == FLAGS.word_end_index, 0.0, durations
     )
-    mels = predict_mel(tokens, durations)
+    mels = predict_mel(tokens, durations, acoustic_ckpt)
     if tokens[-1] == FLAGS.sil_index:
         end_silence = durations[0, -1].item()
         silence_frame = int(end_silence * FLAGS.sample_rate / (FLAGS.n_fft // 4))
@@ -114,4 +117,4 @@ if __name__ == "__main__":
     plt.savefig(str(args.output))
     plt.close()
     mel = jax.device_get(mel)
-    mel.tofile("clip.mel")
+    mel.tofile("clip.mel")
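The same parameterization is threaded through the NAT front end: predict_duration and predict_mel now receive their checkpoint paths from text2mel, whose new duration_ckpt and acoustic_ckpt keywords default to the old FLAGS.ckpt_dir locations. An end-to-end sketch combining both changed files, assuming a per-voice checkpoint directory whose file names mirror the new defaults; the directory, phrase, and silence value are illustrative:

from pathlib import Path

from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel

# Hypothetical fine-tuned voice; file names mirror the new default arguments.
voice_dir = Path("checkpoints/my_voice")

mel = text2mel(
    "xin chào",  # the phrase must be covered by the lexicon
    silence_duration=0.2,  # floor for silence-token durations (assumed seconds)
    duration_ckpt=voice_dir / "duration_latest_ckpt.pickle",
    acoustic_ckpt=voice_dir / "acoustic_latest_ckpt.pickle",
)
audio = mel2wave(mel, ckpt_file=voice_dir / "hk_hifi.pickle")

Because every path is now an argument, one process can synthesize with several voices without mutating FLAGS or the module source between calls.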