Configuration Parsing Warning: In config.json: "quantization_config.bits" must be an integer

Sample inference script:

import re
from argparse import ArgumentParser

import torch
import torchaudio
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2DynamicJob,
    ExLlamaV2Sampler,
)
from torchaudio import functional as F
from xcodec2.modeling_xcodec2 import XCodec2Model

# Command-line interface. Every option is required; the help strings describe
# how each value is consumed further down in this script.
parser = ArgumentParser(
    description=(
        "Generate speech for a text in the voice of a reference recording, "
        "using an ExLlamaV2 language model and an XCodec2 vocoder."
    ),
)
parser.add_argument(
    "-m",
    "--model",
    type=str,
    required=True,
    help="model location passed to ExLlamaV2Config",
)
parser.add_argument(
    "-v",
    "--vocoder",
    type=str,
    required=True,
    help="vocoder name or path passed to XCodec2Model.from_pretrained",
)
parser.add_argument(
    "-a",
    "--audio",
    type=str,
    required=True,
    help="reference audio file; it is downmixed to mono and resampled to 16 kHz",
)
parser.add_argument(
    "-t",
    "--transcript",
    type=str,
    required=True,
    help="text placed in the prompt directly before --input "
    "(expected to be the transcript of the reference audio)",
)
parser.add_argument(
    "-i",
    "--input",
    type=str,
    required=True,
    help="text to synthesise",
)
parser.add_argument(
    "-o",
    "--output",
    type=str,
    required=True,
    help="path of the audio file to write (saved at 16 kHz)",
)
args = parser.parse_args()

# Language-model setup. The context length is capped on the config before the
# model and cache are built from it.
config = ExLlamaV2Config(args.model)
config.max_seq_len = 2048

# The model and its cache are both created lazily; load_autosplit() then
# performs the actual load using that cache.
model = ExLlamaV2(config, lazy_load=True)
cache = ExLlamaV2Cache(model, lazy=True)
model.load_autosplit(cache)

tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2DynamicGenerator(
    model=model,
    cache=cache,
    tokenizer=tokenizer,
)

# Reference recording: load it, then normalise to a single channel at 16 kHz.
audio, sample_rate = torchaudio.load(args.audio)

# More than one row along dim 0 is averaged down to one (keepdim keeps the
# tensor 2-D).
if audio.size(0) > 1:
    audio = audio.mean(dim=0, keepdim=True)

if sample_rate != 16000:
    audio = F.resample(audio, orig_freq=sample_rate, new_freq=16000)

# Vocoder: moved to the GPU and switched to eval mode.
vocoder = XCodec2Model.from_pretrained(args.vocoder).cuda().eval()

# Encode the reference audio with the vocoder and render every code as a
# "<|s_N|>" token string. The builtin `input` is deliberately not rebound here.
reference_codes = vocoder.encode_code(audio)

# [0, 0, :] selects the code sequence of the single clip. tolist() converts the
# whole sequence to Python numbers in one step instead of formatting one 0-d
# tensor per element.
speech_tokens = "".join(
    f"<|s_{code}|>" for code in reference_codes[0, 0, :].tolist()
)

# Chat-style prompt: the user turn carries the text, the assistant turn is
# pre-filled with the reference speech tokens so the model continues from them.
# NOTE(review): transcript and input are concatenated without a separator, so
# the transcript should end with whitespace or punctuation — confirm intended.
prompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "Convert the text to speech:"
    "<|TEXT_UNDERSTANDING_START|>"
    f"{args.transcript}{args.input}"
    "<|TEXT_UNDERSTANDING_END|>"
    "<|eot_id|>\n"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "<|SPEECH_GENERATION_START|>"
    f"{speech_tokens}"
)

# Tokenise the prompt; special tokens written out in the prompt text are
# encoded as such.
input_ids = tokenizer.encode(prompt, add_bos=True, encode_special_tokens=True)

# Whatever context remains after the prompt is the generation budget. A prompt
# that already fills the context leaves nothing to generate, so fail early with
# an actionable message instead of submitting a job with a non-positive budget.
max_new_tokens = config.max_seq_len - input_ids.shape[-1]
if max_new_tokens <= 0:
    raise SystemExit(
        f"Prompt is {input_ids.shape[-1]} tokens, which leaves no room in the "
        f"{config.max_seq_len}-token context; use a shorter reference "
        "recording or shorter text."
    )

# Sampling configuration.
gen_settings = ExLlamaV2Sampler.Settings()
gen_settings.temperature = 0.8
gen_settings.top_p = 1.0

# Generation stops at the end-of-speech marker.
stop_conditions = ["<|SPEECH_GENERATION_END|>"]

job = ExLlamaV2DynamicJob(
    input_ids=input_ids,
    max_new_tokens=max_new_tokens,
    gen_settings=gen_settings,
    stop_conditions=stop_conditions,
)

# Run the job to completion, collecting the streamed text chunks in order.
generator.enqueue(job)
streamed_chunks = []

while generator.num_remaining_jobs():
    for result in generator.iterate():
        # Only results in the "streaming" stage contribute text.
        if result.get("stage") == "streaming":
            streamed_chunks.append(result.get("text", ""))

        # Once a result reports end-of-stream, clear the generator's queue.
        if result.get("eos"):
            generator.clear_queue()

# Full generated text, read by the decoding step that follows.
output = "".join(streamed_chunks)

# Recover the numeric codes from every "<|s_N|>" token in the generated text.
generated_codes = [int(code) for code in re.findall(r"<\|s_(\d+)\|>", output)]

# Without any codes the tensor below would be empty and the vocoder would fail
# with an unrelated-looking error, so stop with a clear message instead.
if not generated_codes:
    raise SystemExit("The model generated no speech tokens; nothing to decode.")

# The double nesting gives the code tensor two leading dimensions of size 1.
code_tensor = torch.tensor([[generated_codes]]).cuda()
waveform = vocoder.decode_code(code_tensor)

# [0, 0, :] selects the sample sequence; unsqueeze(0) puts back one leading
# dimension before the tensor is moved to the CPU and saved.
waveform = waveform[0, 0, :].unsqueeze(0).cpu()

torchaudio.save(args.output, waveform, 16000)
Downloads last month
2
Inference API
Unable to determine this model's library. Check the docs.

Model tree for Annuvin/Llasa-3B-6.5bpw-h8-exl2

Quantized
(4)
this model