"""Round-trip an audio file through XCodec2.

Loads a pretrained ``XCodec2Model``, encodes ``test.flac`` into codec
codes, prints the codes, decodes them back to a waveform, and writes the
result to ``reconstructed.wav``.
"""

import torch
import soundfile as sf
from transformers import AutoConfig  # not referenced below; kept as-is

from modeling_xcodec2 import XCodec2Model

# Local checkpoint directory, or your repository name on Hugging Face.
model_path = "/data/zheny/xcodec2"

model = XCodec2Model.from_pretrained(model_path)
model.eval().cuda()

# Prepare an audio clip.
wav, sr = sf.read("test.flac")
if wav.ndim > 1:
    # soundfile returns (frames, channels) for multi-channel files; downmix
    # to mono so the tensor below really has the [1, time] layout.
    wav = wav.mean(axis=1)
wav_tensor = torch.from_numpy(wav).float().unsqueeze(0)  # [1, time]
# NOTE(review): wav_tensor stays on the CPU while the model is on CUDA;
# presumably encode_code moves its input to the model device -- confirm.

with torch.no_grad():
    vq_code = model.encode_code(input_waveform=wav_tensor)
    print(vq_code)

    recon_wav = model.decode_code(vq_code).cpu()

# NOTE(review): the output is written with the input file's sample rate;
# confirm this matches the rate the codec actually decodes at.
sf.write("reconstructed.wav", recon_wav[0, 0, :].numpy(), sr)