|
import safetensors
import safetensors.torch
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download
|
|
|
# Hugging Face Hub repository that every checkpoint below is downloaded from.
MASKGCT_REPO_ID = "amphion/MaskGCT"

# Sample rate handed to ``sf.write`` when saving the generated audio.
OUTPUT_SAMPLE_RATE = 24000


def main():
    """Build the MaskGCT models, load their weights and synthesize one utterance.

    Steps, in order:

    1. Read the model configuration and build every sub-model on ``cuda:0``.
    2. Download all checkpoint files from ``MASKGCT_REPO_ID``.
    3. Load each checkpoint into its model with ``safetensors``.
    4. Run the inference pipeline on the prompt audio/text and the target text.
    5. Write the returned audio to ``save_path`` as a WAV file.

    NOTE(review): ``load_config``, the ``build_*`` helpers and
    ``MaskGCT_Inference_Pipeline`` are not imported by any line visible in
    this part of the file -- confirm they are brought into scope elsewhere.
    """
    # ---- build models ---------------------------------------------------
    device = torch.device("cuda:0")
    cfg_path = "./models/tts/maskgct/config/maskgct.json"
    cfg = load_config(cfg_path)

    semantic_model, semantic_mean, semantic_std = build_semantic_model(device)
    semantic_codec = build_semantic_codec(cfg.model.semantic_codec, device)
    codec_encoder, codec_decoder = build_acoustic_codec(
        cfg.model.acoustic_codec, device
    )
    t2s_model = build_t2s_model(cfg.model.t2s_model, device)
    s2a_model_1layer = build_s2a_model(cfg.model.s2a_model.s2a_1layer, device)
    s2a_model_full = build_s2a_model(cfg.model.s2a_model.s2a_full, device)

    # ---- download, then load, the checkpoints ----------------------------
    # Each entry pairs a model with the file (inside the Hub repo) holding
    # its weights. The order matches the order the weights are loaded in.
    checkpoint_files = [
        (semantic_codec, "semantic_codec/model.safetensors"),
        (codec_encoder, "acoustic_codec/model.safetensors"),
        (codec_decoder, "acoustic_codec/model_1.safetensors"),
        (t2s_model, "t2s_model/model.safetensors"),
        (s2a_model_1layer, "s2a_model/s2a_model_1layer/model.safetensors"),
        (s2a_model_full, "s2a_model/s2a_model_full/model.safetensors"),
    ]

    # Fetch every file before loading any of them, so a failed download
    # surfaces before any model has been modified.
    local_checkpoints = [
        (model, hf_hub_download(MASKGCT_REPO_ID, filename=filename))
        for model, filename in checkpoint_files
    ]
    for model, checkpoint_path in local_checkpoints:
        safetensors.torch.load_model(model, checkpoint_path)

    # ---- inference inputs ------------------------------------------------
    prompt_wav_path = "/home/kade/maskgct/alan_watts_sample_2.wav"
    save_path = "/home/kade/maskgct/save"
    prompt_text = "About the basic ideas which as westerners living today in the United States influence our everyday common sense, our fundamental notions about what life is about and there are historical"
    target_text = "I really like sucking on otter dicks. Subscribe to my fucking youtube channel!"

    # NOTE(review): presumably the desired length of the generated speech in
    # seconds -- confirm against ``maskgct_inference``.
    target_len = 18

    maskgct_inference_pipeline = MaskGCT_Inference_Pipeline(
        semantic_model,
        semantic_codec,
        codec_encoder,
        codec_decoder,
        t2s_model,
        s2a_model_1layer,
        s2a_model_full,
        semantic_mean,
        semantic_std,
        device,
    )

    # NOTE(review): the two "en" arguments look like the prompt language and
    # the target language -- confirm against the pipeline's signature.
    recovered_audio = maskgct_inference_pipeline.maskgct_inference(
        prompt_wav_path, prompt_text, target_text, "en", "en", target_len=target_len
    )

    # ``save_path`` has no file extension, so soundfile cannot infer the
    # container format from the name and would raise TypeError. Name the
    # format explicitly instead of relying on the extension.
    # NOTE(review): if ``save_path`` is meant to be a directory, a file name
    # still has to be appended -- confirm the intended output location.
    sf.write(save_path, recovered_audio, OUTPUT_SAMPLE_RATE, format="WAV")


if __name__ == "__main__":
    main()
|
|
|
|