Add MMS inference

- Dockerfile +8 -1
- app.py +30 -11
- mms.py +84 -0
Dockerfile
CHANGED
@@ -5,7 +5,7 @@ RUN apt-get update && apt-get install -y gnupg && \
     echo "deb http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
     echo "deb-src http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
     apt-get update && \
-    apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev
+    apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev libatlas-base-dev gfortran

 RUN git clone -b ca-to-pr https://github.com/projecte-aina/espeak-ng

@@ -31,7 +31,14 @@ COPY --chown=user models models

 RUN pip install -r requirements.txt

+RUN git clone https://github.com/jaywalnut310/vits.git && \
+    cd vits && sed -i s/torch==1.6.0/torch==1.7.0/ requirements.txt && pip install -r requirements.txt && cd monotonic_align && \
+    python setup.py build_ext --inplace && cd /home/user
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user/app/vits
+
 COPY --chown=user engine.py .
+COPY --chown=user mms.py .
 COPY --chown=user festival.py .
 COPY --chown=user app.py .

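The cloned vits repo is used as a plain source tree rather than an installed package: the ENV PYTHONPATH line above is what lets mms.py resolve import commons, import utils, and from models import SynthesizerTrn. Below is a minimal sanity check one could run inside the built image; it is not part of the commit, and the module list is inferred from the imports in mms.py further down.

# check_vits_path.py -- hypothetical helper, not in this commit.
# Confirms the modules mms.py needs resolve via PYTHONPATH,
# without executing them (find_spec only locates the source).
import importlib.util

for mod in ("commons", "utils", "models", "monotonic_align"):
    spec = importlib.util.find_spec(mod)
    print(f"{mod}: {'found' if spec else 'NOT FOUND, check PYTHONPATH'}")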
app.py
CHANGED
@@ -1,14 +1,11 @@
 import tempfile
-from typing import Optional
-from TTS.config import load_config
 import gradio as gr
-import numpy as np
 import os
-from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 from espeak_phonemizer import Phonemizer
 from engine import Piper
 from festival import festival_synthesize
+from mms import MMS

 MAX_TXT_LEN = 325

@@ -41,6 +38,9 @@ def carrega_collectivat():
 def carrega_piper():
     return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")

+def carrega_mms():
+    return MMS(os.getcwd() + "/models/mms")
+
 model_bsc = carrega_bsc()
 SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
@@ -49,6 +49,10 @@ model_collectivat = carrega_collectivat()

 model_piper = carrega_piper()

+model_mms = carrega_mms()
+
+request_count = 0
+
 def tts(text, festival_voice, speaker_idx):
     if len(text) > MAX_TXT_LEN:
         text = text[:MAX_TXT_LEN]
@@ -60,9 +64,6 @@ def tts(text, festival_voice, speaker_idx):
     wav_coll = model_collectivat.tts(text)
     wav_piper = model_piper.synthesize(text)

-    #return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
-
-    # return output
     fp_bsc = ""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         model_bsc.save_wav(wav_bsc, fp)
@@ -77,12 +78,20 @@ def tts(text, festival_voice, speaker_idx):
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         fp.write(wav_piper)
         fp_piper = fp.name
+
+    fp_mms = ""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        model_mms.synthesize(fp.name, text)
+        fp_mms = fp.name

     fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

     fp_festival = festival_synthesize(text, festival_voice)

-    return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper
+    global request_count
+    request_count += 1
+    print(f"Requests: {request_count}")
+    return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper, fp_mms


 description="""
@@ -91,8 +100,11 @@ Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuro
 1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
 2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
 3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
+4. Model VITS entrenat per Meta (llicència CC-BY-NC) [enllaç](https://github.com/facebookresearch/fairseq/tree/main/examples/mms)

-
+El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus, pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per a la comparativa, però podeu provar les altres.
+Els models 2 i 3 han estat entrenats amb la veu d'Ona de FestCAT.
+El model 4, anomenat MMS, de Meta (Facebook) ha estat entrenat a partir de dades d'un [audiollibre](http://live.bible.is/bible/CATBSS/LUK/1) de la Bíblia.

 Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
 https://github.com/projecte-aina/espeak-ng
@@ -116,13 +128,20 @@ iface = gr.Interface(
         gr.Audio(label="Festival",type="filepath"),
         gr.Audio(label="BSC VITS",type="filepath"),
         gr.Audio(label="Collectivat Fastspeech",type="filepath"),
-        gr.Audio(label="Piper VITS",type="filepath")
+        gr.Audio(label="Piper VITS",type="filepath"),
+        gr.Audio(label="Meta MMS VITS",type="filepath")
     ],
     title="Comparativa de síntesi lliure en català",
     description=description,
     article=article,
     allow_flagging="never",
     layout="vertical",
-    live=False
+    live=False,
+    examples=[
+        ["Duc pa sec al sac, m'assec on sóc i el suco amb suc", "ona", "ona"],
+        ["Un plat pla blanc, ple de pebre negre n'era. Un plat blanc pla, ple de pebre negre està", "ona", "ona"],
+        ["Visc al bosc i busco vesc i visc del vesc que busco al bosc", "ona", "ona"],
+        ["Una polla xica, pica, pellarica, camatorta i becarica va tenir sis polls xics, pics, pellarics, camacurts i becarics. Si la polla no hagués sigut xica, pica, pellarica, camatorta i becarica, els sis polls no haurien sigut xics, pics, pellarics, camacurts i becarics.", "ona", "ona"]
+    ]
 )
 iface.launch(server_name="0.0.0.0", server_port=7860)

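Every engine in tts() hands Gradio a file path (the outputs all use type="filepath"); the new MMS branch follows the same temp-file pattern, except the engine writes the WAV itself instead of fp.write. Here is a minimal sketch of that pattern, not part of the commit, where synth stands in for any synthesize(path, text) callable such as model_mms.synthesize:

# Sketch of the temp-file handoff used in tts() above.
import tempfile

def render_to_tempfile(synth, text: str) -> str:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synth(fp.name, text)  # engine writes a complete WAV to fp.name
        return fp.name        # delete=False, so the file outlives this scope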
mms.py
ADDED
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import torch
+import commons
+import utils
+from models import SynthesizerTrn
+from scipy.io.wavfile import write
+from pathlib import Path
+from typing import Union
+
+class TextMapper(object):
+    def __init__(self, vocab_file):
+        self.symbols = [x.replace("\n", "") for x in open(vocab_file).readlines()]
+        self.SPACE_ID = self.symbols.index(" ")
+        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
+
+    def text_to_sequence(self, text, cleaner_names):
+        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+        Args:
+            text: string to convert to a sequence
+            cleaner_names: names of the cleaner functions to run the text through
+        Returns:
+            List of integers corresponding to the symbols in the text
+        '''
+        sequence = []
+        clean_text = text.strip()
+        for symbol in clean_text:
+            symbol_id = self._symbol_to_id[symbol]
+            sequence += [symbol_id]
+        return sequence
+
+    def get_text(self, text, hps):
+        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
+        if hps.data.add_blank:
+            text_norm = commons.intersperse(text_norm, 0)
+        text_norm = torch.LongTensor(text_norm)
+        return text_norm
+
+    def filter_oov(self, text):
+        val_chars = self._symbol_to_id
+        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
+        print(f"text after filtering OOV: {txt_filt}")
+        return txt_filt
+
+class MMS():
+    def __init__(self, model_path: Union[str, Path]):
+        ckpt_dir = model_path
+        vocab_file = f"{ckpt_dir}/vocab.txt"
+        config_file = f"{ckpt_dir}/config.json"
+        assert os.path.isfile(config_file), f"{config_file} doesn't exist"
+        self.hps = utils.get_hparams_from_file(config_file)
+        self.text_mapper = TextMapper(vocab_file)
+        self.net_g = SynthesizerTrn(
+            len(self.text_mapper.symbols),
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            **self.hps.model)
+        g_pth = f"{ckpt_dir}/G_100000.pth"
+        print(f"load {g_pth}")
+
+        _ = utils.load_checkpoint(g_pth, self.net_g, None)
+
+    def synthesize(self, wav_path: str, txt):
+        print(f"text: {txt}")
+        txt = txt.lower()
+        txt = self.text_mapper.filter_oov(txt)
+        stn_tst = self.text_mapper.get_text(txt, self.hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+            hyp = self.net_g.infer(
+                x_tst, x_tst_lengths, noise_scale=.667,
+                noise_scale_w=0.8, length_scale=1.0
+            )[0][0,0].cpu().float().numpy()
+
+        os.makedirs(os.path.dirname(wav_path), exist_ok=True)
+        print(f"wav: {wav_path}")
+        write(wav_path, self.hps.data.sampling_rate, hyp)
+        return
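For reference, a hedged usage sketch of the class above, not part of the commit. The checkpoint directory layout follows what MMS.__init__ expects (vocab.txt, config.json and G_100000.pth, as in the app's models/mms directory); the output path and text are illustrative.

# demo_mms.py -- hypothetical driver for mms.py.
from mms import MMS

model = MMS("models/mms")  # directory with vocab.txt, config.json, G_100000.pth
model.synthesize("/tmp/mms-demo.wav", "Bon dia a tothom")  # writes the WAV via scipy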