"""TalktoAI: chat with ChatGPT and hear the reply in a cloned voice.

Pipeline implemented below: OpenAI chat completion -> Coqui-TTS synthesis
conditioned on a reference clip -> VoiceFixer restoration -> SpeechBrain
MetricGAN+ enhancement, all served through a Gradio interface.
"""
# NOTE(review): nothing in this file calls `turtle.title`; this looks like an
# accidental editor auto-import. Kept so the module's import set is unchanged.
from turtle import title

import gradio as gr
import git
import os
import subprocess
import sys

# The TTS fork is fetched and installed at start-up, so these commands must
# run before any `TTS` import below.
os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
os.system('pip install -q -e TTS/')
os.system('pip install -q torchaudio==0.9.0')

TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally

import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio

import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor

from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

os.system('pip install voicefixer --upgrade')
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()

import openai

import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Decide the device once so every component agrees and the app still starts
# on a CPU-only machine.
USE_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if USE_CUDA else "cpu"

enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": DEVICE},
)

# Per-persona chat histories, each seeded with its system prompt.
# NOTE(review): these lists are module-level, so the conversation history is
# shared by every user of the running app -- confirm this is intended.
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral Englsih and give me feedback."}
]

mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]

mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]

# Maps the radio-button value to the history list it continues.
_PERSONA_HISTORIES = {
    "TOEFL": mes1,
    "Therapist": mes2,
    "Alice": mes3,
}

# Limits enforced by greet().
MAX_REFERENCE_BYTES = 30 * 1024 * 1024
MAX_TEXT_CHARS = 2000

OUT_PATH = 'out/'

# create output path
os.makedirs(OUT_PATH, exist_ok=True)

# model vars
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"

# load the config
C = load_config(CONFIG_PATH)

# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
# remove speaker encoder
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]

model.load_state_dict(model_weights)

model.eval()

if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

os.system('pip install -q pydub ffmpeg-normalize')

CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)


def compute_spec(ref_file):
    """Load ``ref_file`` at the audio processor's sample rate and return its
    spectrogram as a float tensor with a leading dimension of size 1."""
    y, sr = librosa.load(ref_file, sr=ap.sample_rate)
    spec = ap.spectrogram(y)
    spec = torch.FloatTensor(spec).unsqueeze(0)
    return spec


def _select_reference(uploaded, recorded):
    """Return the reference clip path, preferring the upload over the recording.

    Raises:
        ValueError: if neither clip was provided, the file does not exist, or
            it is larger than ``MAX_REFERENCE_BYTES``.
    """
    chosen = uploaded if uploaded is not None else recorded
    if chosen is None:
        raise ValueError("Please upload or record a reference voice clip.")

    path = str(chosen)
    print("path url")
    print(path)

    if not os.path.isfile(path):
        raise ValueError("Reference audio file not found: %s" % path)
    # Measure the file itself; the size of the path string says nothing
    # about the amount of audio.
    if os.path.getsize(path) > MAX_REFERENCE_BYTES:
        raise ValueError("File is greater than 30mb. Please re-try with smaller sizes.")
    return path


def _normalize_reference(path):
    """Best-effort in-place loudness normalisation of ``path`` via ffmpeg-normalize.

    Failures are reported but not raised, so synthesis can continue with the
    un-normalised clip.
    """
    # An argument list (no shell) passes the path verbatim: no shell-variable
    # expansion and no injection through file names.
    command = [
        "ffmpeg-normalize", path,
        "-nt", "rms",
        "-t=-27",
        "-o", path,
        "-ar", "16000",
        "-f",
    ]
    try:
        finished = subprocess.run(command, check=False)
    except OSError as err:
        print(" > ffmpeg-normalize could not be run: {}".format(err))
        return
    if finished.returncode != 0:
        print(" > ffmpeg-normalize exited with code {}".format(finished.returncode))


def _chat(history, user_text):
    """Send ``history`` plus the new user turn to ChatGPT and return the reply.

    ``history`` is extended with both turns only after the API call succeeds,
    so a failed request does not leave an unanswered user message behind.
    """
    user_turn = {"role": "user", "content": user_text}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=history + [user_turn],
    )
    reply = completion.choices[0].message.content
    history.append(user_turn)
    history.append({"role": "assistant", "content": reply})
    return reply


def greet(apikey, Voicetoclone, VoiceMicrophone, Texts, choice1):
    """Answer ``Texts`` with ChatGPT and speak the answer in the reference voice.

    Args:
        apikey: OpenAI API key used for this request.
        Voicetoclone: path of an uploaded reference clip, or None.
        VoiceMicrophone: path of a recorded reference clip, or None; used only
            when ``Voicetoclone`` is None.
        Texts: the user's message.
        choice1: persona name, one of "TOEFL", "Therapist", "Alice".

    Returns:
        ``[Texts, chat_response, "enhanced.wav"]`` -- the user's text, the
        ChatGPT reply, and the path of the enhanced synthesized audio.

    Raises:
        ValueError: on an unknown persona, a missing or oversized reference
            clip, or a reply longer than ``MAX_TEXT_CHARS`` characters.
    """
    history = _PERSONA_HISTORIES.get(choice1)
    if history is None:
        raise ValueError("Please choose one of: TOEFL, Therapist, Alice.")

    # Validate the reference clip before spending an OpenAI request.
    reference_file = _select_reference(Voicetoclone, VoiceMicrophone)

    # chatgpt
    openai.api_key = apikey
    chat_response = _chat(history, Texts)
    text = "%s" % (chat_response)

    if len(text) > MAX_TEXT_CHARS:
        raise ValueError(
            "Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        )

    _normalize_reference(reference_file)
    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)

    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3  # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3  # defines the noise variance applied to the duration predictor z vector at inference.

    language_id = 0

    print(" > text: {}".format(text))
    wav, alignment, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")

    file_name = "Audio.wav"
    out_path = os.path.join(OUT_PATH, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

    voicefixer.restore(
        input=out_path,  # input wav file path
        output="audio1.wav",  # output wav file path
        cuda=USE_CUDA,  # whether to use gpu acceleration
        mode=0,  # You can try out mode 0, 1 to find out the best result
    )

    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    # The first output slot shows the user's own text; the previous
    # `result.text` referred to a name that was never defined.
    return [Texts, chat_response, "enhanced.wav"]


output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio(label="Audio with Custom Voice")

gr.Interface(
    title = '🥳💬💕 - TalktoAI,随时随地,谈天说地!',
    theme="huggingface",
    description = "🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
    fn=greet,
    inputs=[
        gr.Textbox(lines=1, label = "请填写您的OpenAI-API-key"),
        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
        gr.Audio(source="microphone", streaming = True, label = "请用语音上传您喜欢的声音,语音和文件上传二选一即可", type="filepath"),
        gr.Textbox(lines=3, label = "请开始对话吧!"),
        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
    ],
    outputs=[
        output_1, output_2, output_3
    ],
).launch()