--- base_model: - HKUST-Audio/Llasa-3B --- Sample inference script: ```py import re from argparse import ArgumentParser import torch import torchaudio from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer from exllamav2.generator import ( ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler, ) from torchaudio import functional as F from xcodec2.modeling_xcodec2 import XCodec2Model parser = ArgumentParser() parser.add_argument("-m", "--model", type=str, required=True) parser.add_argument("-v", "--vocoder", type=str, required=True) parser.add_argument("-a", "--audio", type=str, required=True) parser.add_argument("-t", "--transcript", type=str, required=True) parser.add_argument("-i", "--input", type=str, required=True) parser.add_argument("-o", "--output", type=str, required=True) args = parser.parse_args() config = ExLlamaV2Config(args.model) config.max_seq_len = 2048 model = ExLlamaV2(config, lazy_load=True) cache = ExLlamaV2Cache(model, lazy=True) model.load_autosplit(cache) tokenizer = ExLlamaV2Tokenizer(config) generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer) audio, sample_rate = torchaudio.load(args.audio) if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True) if sample_rate != 16000: audio = F.resample(audio, sample_rate, 16000) vocoder = XCodec2Model.from_pretrained(args.vocoder) vocoder = vocoder.cuda().eval() input = vocoder.encode_code(audio) input = input[0, 0, :] input = [f"<|s_{i}|>" for i in input] input = "".join(input) prompt = ( "<|start_header_id|>user<|end_header_id|>\n\n" "Convert the text to speech:" "<|TEXT_UNDERSTANDING_START|>" f"{args.transcript}{args.input}" "<|TEXT_UNDERSTANDING_END|>" "<|eot_id|>\n" "<|start_header_id|>assistant<|end_header_id|>\n\n" "<|SPEECH_GENERATION_START|>" f"{input}" ) input_ids = tokenizer.encode(prompt, add_bos=True, encode_special_tokens=True) max_new_tokens = config.max_seq_len - input_ids.shape[-1] gen_settings = ExLlamaV2Sampler.Settings() gen_settings.temperature = 0.8 gen_settings.top_p = 1.0 stop_conditions = ["<|SPEECH_GENERATION_END|>"] job = ExLlamaV2DynamicJob( input_ids=input_ids, max_new_tokens=max_new_tokens, gen_settings=gen_settings, stop_conditions=stop_conditions, ) generator.enqueue(job) output = "" while generator.num_remaining_jobs(): for result in generator.iterate(): if result.get("stage") == "streaming": text = result.get("text", "") output += text if result.get("eos"): generator.clear_queue() output = [int(o) for o in re.findall(r"<\|s_(\d+)\|>", output)] output = torch.tensor([[output]]).cuda() output = vocoder.decode_code(output) output = output[0, 0, :] output = output.unsqueeze(0).cpu() torchaudio.save(args.output, output, 16000) ```