k4d3
/

toolkit

Model card Files Files and versions Community

toolkit / maskgct.py

k4d3's picture

ooh, thats a lot of stuff to explain...

516099b 16 days ago

3.26 kB

	from huggingface_hub import hf_hub_download
	import torch
	import safetensors
	import soundfile as sf

	if __name__ == "__main__":

	# build model
	device = torch.device("cuda:0")
	cfg_path = "./models/tts/maskgct/config/maskgct.json"
	cfg = load_config(cfg_path)
	# 1. build semantic model (w2v-bert-2.0)
	semantic_model, semantic_mean, semantic_std = build_semantic_model(device)
	# 2. build semantic codec
	semantic_codec = build_semantic_codec(cfg.model.semantic_codec, device)
	# 3. build acoustic codec
	codec_encoder, codec_decoder = build_acoustic_codec(cfg.model.acoustic_codec, device)
	# 4. build t2s model
	t2s_model = build_t2s_model(cfg.model.t2s_model, device)
	# 5. build s2a model
	s2a_model_1layer = build_s2a_model(cfg.model.s2a_model.s2a_1layer, device)
	s2a_model_full = build_s2a_model(cfg.model.s2a_model.s2a_full, device)

	# download checkpoint
	# download semantic codec ckpt
	semantic_code_ckpt = hf_hub_download("amphion/MaskGCT", filename="semantic_codec/model.safetensors")

	# download acoustic codec ckpt
	codec_encoder_ckpt = hf_hub_download("amphion/MaskGCT", filename="acoustic_codec/model.safetensors")
	codec_decoder_ckpt = hf_hub_download("amphion/MaskGCT", filename="acoustic_codec/model_1.safetensors")

	# download t2s model ckpt
	t2s_model_ckpt = hf_hub_download("amphion/MaskGCT", filename="t2s_model/model.safetensors")

	# download s2a model ckpt
	s2a_1layer_ckpt = hf_hub_download("amphion/MaskGCT", filename="s2a_model/s2a_model_1layer/model.safetensors")
	s2a_full_ckpt = hf_hub_download("amphion/MaskGCT", filename="s2a_model/s2a_model_full/model.safetensors")

	# load semantic codec
	safetensors.torch.load_model(semantic_codec, semantic_code_ckpt)
	# load acoustic codec
	safetensors.torch.load_model(codec_encoder, codec_encoder_ckpt)
	safetensors.torch.load_model(codec_decoder, codec_decoder_ckpt)
	# load t2s model
	safetensors.torch.load_model(t2s_model, t2s_model_ckpt)
	# load s2a model
	safetensors.torch.load_model(s2a_model_1layer, s2a_1layer_ckpt)
	safetensors.torch.load_model(s2a_model_full, s2a_full_ckpt)

	# inference
	prompt_wav_path = "/home/kade/maskgct/alan_watts_sample_2.wav"
	save_path = "/home/kade/maskgct/save"
	prompt_text = "About the basic ideas which as westerners living today in the United States influence our everyday common sense, our fundamental notions about what life is about and there are historical"
	target_text = "I really like sucking on otter dicks. Subscribe to my fucking youtube channel!"
	# Specify the target duration (in seconds). If target_len = None, we use a simple rule to predict the target duration.
	target_len = 18

	maskgct_inference_pipeline = MaskGCT_Inference_Pipeline(
	semantic_model,
	semantic_codec,
	codec_encoder,
	codec_decoder,
	t2s_model,
	s2a_model_1layer,
	s2a_model_full,
	semantic_mean,
	semantic_std,
	device,
	)

	recovered_audio = maskgct_inference_pipeline.maskgct_inference(
	prompt_wav_path, prompt_text, target_text, "en", "en", target_len=target_len
	)
	sf.write(save_path, recovered_audio, 24000)