Archan committed on
Commit bbdb87d
1 Parent(s): 0950ce1

Update tts.py

Files changed (1)
  1. tts.py +31 -14
tts.py CHANGED
@@ -1,18 +1,35 @@
- from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
- from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
-
- models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-     "facebook/fastspeech2-en-ljspeech",
-     arg_overrides={"vocoder": "hifigan", "fp16": False}
- )
- model = models
- TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
- generator = task.build_generator(model, cfg)
-
- def tts(text):
-     print("Converting to TTS")
-     sample = TTSHubInterface.get_model_input(task, text)
-     wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
-     return wave, rate
+ import time
+ import torch
+ import scipy.io.wavfile
+ from espnet2.bin.tts_inference import Text2Speech
+ from espnet2.utils.types import str_or_none
+
+ tagen = 'kan-bayashi/ljspeech_vits'
+ vocoder_tagen = "none"
+
+ text2speechen = Text2Speech.from_pretrained(
+     model_tag=str_or_none(tagen),
+     vocoder_tag=str_or_none(vocoder_tagen),
+     device="cpu",
+     # Only for Tacotron 2 & Transformer
+     threshold=0.5,
+     # Only for Tacotron 2
+     minlenratio=0.0,
+     maxlenratio=10.0,
+     use_att_constraint=False,
+     backward_window=1,
+     forward_window=3,
+     # Only for FastSpeech & FastSpeech2 & VITS
+     speed_control_alpha=1.0,
+     # Only for VITS
+     noise_scale=0.333,
+     noise_scale_dur=0.333,
+ )
+
+ def inference(text, lang):
+     with torch.no_grad():
+         if lang == "english":
+             wav = text2speechen(text)["wav"]
+             scipy.io.wavfile.write("./audio/out.wav", text2speechen.fs, wav.view(-1).cpu().numpy())
+     return "./audio/out.wav"
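For context, a minimal usage sketch of the updated helper. It assumes tts.py is importable as a module named tts and that an ./audio output directory can be created; neither assumption is part of this commit.

import os
from tts import inference  # assumption: tts.py is on the import path; importing it loads the VITS model

# The new inference() synthesizes English speech with the pretrained VITS model
# and writes the result to ./audio/out.wav, returning that path.
os.makedirs("./audio", exist_ok=True)
path = inference("Hello from the new VITS-based pipeline.", "english")
print("Synthesized audio written to", path)

Note that only the "english" branch synthesizes audio in this commit; the returned path is hard-coded to ./audio/out.wav.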