Add MMS inference

- Dockerfile +8 -1
- app.py +30 -11
- mms.py +84 -0
Dockerfile
CHANGED
@@ -5,7 +5,7 @@ RUN apt-get update && apt-get install -y gnupg && \
     echo "deb http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
     echo "deb-src http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
     apt-get update && \
-    apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev
+    apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev libatlas-base-dev gfortran

 RUN git clone -b ca-to-pr https://github.com/projecte-aina/espeak-ng

@@ -31,7 +31,14 @@ COPY --chown=user models models

 RUN pip install -r requirements.txt

+RUN git clone https://github.com/jaywalnut310/vits.git && \
+    cd vits && sed -i s/torch==1.6.0/torch==1.7.0/ requirements.txt && pip install -r requirements.txt && cd monotonic_align && \
+    python setup.py build_ext --inplace && cd /home/user
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user/app/vits
+
 COPY --chown=user engine.py .
+COPY --chown=user mms.py .
 COPY --chown=user festival.py .
 COPY --chown=user app.py .

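The cloned vits repo is used as a plain source tree rather than an installed package: the ENV PYTHONPATH line above is what lets mms.py resolve import commons, import utils, and from models import SynthesizerTrn. Below is a minimal sanity check one could run inside the built image; it is not part of the commit, and the module list is inferred from the imports in mms.py further down.

# check_vits_path.py -- hypothetical helper, not in this commit.
# Confirms the modules mms.py needs resolve via PYTHONPATH,
# without executing them (find_spec only locates the source).
import importlib.util

for mod in ("commons", "utils", "models", "monotonic_align"):
    spec = importlib.util.find_spec(mod)
    print(f"{mod}: {'found' if spec else 'NOT FOUND, check PYTHONPATH'}")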
app.py
CHANGED
@@ -1,14 +1,11 @@
 import tempfile
-from typing import Optional
-from TTS.config import load_config
 import gradio as gr
-import numpy as np
 import os
-from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 from espeak_phonemizer import Phonemizer
 from engine import Piper
 from festival import festival_synthesize
+from mms import MMS

 MAX_TXT_LEN = 325

@@ -41,6 +38,9 @@ def carrega_collectivat():
 def carrega_piper():
     return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")

+def carrega_mms():
+    return MMS(os.getcwd() + "/models/mms")
+
 model_bsc = carrega_bsc()
 SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
@@ -49,6 +49,10 @@ model_collectivat = carrega_collectivat()

 model_piper = carrega_piper()

+model_mms = carrega_mms()
+
+request_count = 0
+
 def tts(text, festival_voice, speaker_idx):
     if len(text) > MAX_TXT_LEN:
         text = text[:MAX_TXT_LEN]
@@ -60,9 +64,6 @@ def tts(text, festival_voice, speaker_idx):
     wav_coll = model_collectivat.tts(text)
     wav_piper = model_piper.synthesize(text)

-    #return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
-
-    # return output
     fp_bsc = ""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         model_bsc.save_wav(wav_bsc, fp)
@@ -77,12 +78,20 @@ def tts(text, festival_voice, speaker_idx):
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         fp.write(wav_piper)
         fp_piper = fp.name
+
+    fp_mms = ""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        model_mms.synthesize(fp.name, text)
+        fp_mms = fp.name

     fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

     fp_festival = festival_synthesize(text, festival_voice)

-    return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper
+    global request_count
+    request_count += 1
+    print(f"Requests: {request_count}")
+    return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper, fp_mms


 description="""
@@ -91,8 +100,11 @@ Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuro
 1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
 2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
 3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
+4. Model VITS entrenat per Meta (llicència CC-BY-NC) [enllaç](https://github.com/facebookresearch/fairseq/tree/main/examples/mms)

-
+El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus, pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per a la comparativa, però podeu provar les altres.
+Els models 2 i 3 han estat entrenats amb la veu d'Ona de FestCAT.
+El model 4, anomenat MMS, de Meta (Facebook) ha estat entrenat a partir de dades d'un [audiollibre](http://live.bible.is/bible/CATBSS/LUK/1) de la Bíblia.

 Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
 https://github.com/projecte-aina/espeak-ng
@@ -116,13 +128,20 @@ iface = gr.Interface(
         gr.Audio(label="Festival",type="filepath"),
         gr.Audio(label="BSC VITS",type="filepath"),
         gr.Audio(label="Collectivat Fastspeech",type="filepath"),
-        gr.Audio(label="Piper VITS",type="filepath")
+        gr.Audio(label="Piper VITS",type="filepath"),
+        gr.Audio(label="Meta MMS VITS",type="filepath")
     ],
     title="Comparativa de síntesi lliure en català",
     description=description,
     article=article,
     allow_flagging="never",
     layout="vertical",
-    live=False
+    live=False,
+    examples=[
+        ["Duc pa sec al sac, m'assec on sóc i el suco amb suc", "ona", "ona"],
+        ["Un plat pla blanc, ple de pebre negre n'era. Un plat blanc pla, ple de pebre negre està", "ona", "ona"],
+        ["Visc al bosc i busco vesc i visc del vesc que busco al bosc", "ona", "ona"],
+        ["Una polla xica, pica, pellarica, camatorta i becarica va tenir sis polls xics, pics, pellarics, camacurts i becarics. Si la polla no hagués sigut xica, pica, pellarica, camatorta i becarica, els sis polls no haurien sigut xics, pics, pellarics, camacurts i becarics.", "ona", "ona"]
+    ]
 )
 iface.launch(server_name="0.0.0.0", server_port=7860)

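Every engine in tts() hands Gradio a file path (the outputs all use type="filepath"); the new MMS branch follows the same temp-file pattern, except the engine writes the WAV itself instead of fp.write. Here is a minimal sketch of that pattern, not part of the commit, where synth stands in for any synthesize(path, text) callable such as model_mms.synthesize:

# Sketch of the temp-file handoff used in tts() above.
import tempfile

def render_to_tempfile(synth, text: str) -> str:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synth(fp.name, text)  # engine writes a complete WAV to fp.name
        return fp.name        # delete=False, so the file outlives this scope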
mms.py
ADDED
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import torch
+import commons
+import utils
+from models import SynthesizerTrn
+from scipy.io.wavfile import write
+from pathlib import Path
+from typing import Union
+
+class TextMapper(object):
+    def __init__(self, vocab_file):
+        self.symbols = [x.replace("\n", "") for x in open(vocab_file).readlines()]
+        self.SPACE_ID = self.symbols.index(" ")
+        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
+
+    def text_to_sequence(self, text, cleaner_names):
+        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+        Args:
+            text: string to convert to a sequence
+            cleaner_names: names of the cleaner functions to run the text through
+        Returns:
+            List of integers corresponding to the symbols in the text
+        '''
+        sequence = []
+        clean_text = text.strip()
+        for symbol in clean_text:
+            symbol_id = self._symbol_to_id[symbol]
+            sequence += [symbol_id]
+        return sequence
+
+    def get_text(self, text, hps):
+        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
+        if hps.data.add_blank:
+            text_norm = commons.intersperse(text_norm, 0)
+        text_norm = torch.LongTensor(text_norm)
+        return text_norm
+
+    def filter_oov(self, text):
+        val_chars = self._symbol_to_id
+        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
+        print(f"text after filtering OOV: {txt_filt}")
+        return txt_filt
+
+class MMS():
+    def __init__(self, model_path: Union[str, Path]):
+        ckpt_dir = model_path
+        vocab_file = f"{ckpt_dir}/vocab.txt"
+        config_file = f"{ckpt_dir}/config.json"
+        assert os.path.isfile(config_file), f"{config_file} doesn't exist"
+        self.hps = utils.get_hparams_from_file(config_file)
+        self.text_mapper = TextMapper(vocab_file)
+        self.net_g = SynthesizerTrn(
+            len(self.text_mapper.symbols),
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            **self.hps.model)
+        g_pth = f"{ckpt_dir}/G_100000.pth"
+        print(f"load {g_pth}")
+
+        _ = utils.load_checkpoint(g_pth, self.net_g, None)
+
+    def synthesize(self, wav_path: str, txt):
+        print(f"text: {txt}")
+        txt = txt.lower()
+        txt = self.text_mapper.filter_oov(txt)
+        stn_tst = self.text_mapper.get_text(txt, self.hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+            hyp = self.net_g.infer(
+                x_tst, x_tst_lengths, noise_scale=.667,
+                noise_scale_w=0.8, length_scale=1.0
+            )[0][0,0].cpu().float().numpy()
+
+        os.makedirs(os.path.dirname(wav_path), exist_ok=True)
+        print(f"wav: {wav_path}")
+        write(wav_path, self.hps.data.sampling_rate, hyp)
+        return
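For reference, a hedged usage sketch of the class above, not part of the commit. The checkpoint directory layout follows what MMS.__init__ expects (vocab.txt, config.json and G_100000.pth, as in the app's models/mms directory); the output path and text are illustrative.

# demo_mms.py -- hypothetical driver for mms.py.
from mms import MMS

model = MMS("models/mms")  # directory with vocab.txt, config.json, G_100000.pth
model.synthesize("/tmp/mms-demo.wav", "Bon dia a tothom")  # writes the WAV via scipy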