---
base_model:
- HKUSTAudio/Llasa-3B
---

# Sample Inference Script

The script below runs text-to-speech with Llasa-3B through ExLlamaV2 and decodes the generated speech tokens into a waveform with the XCodec2 vocoder. Passing a reference audio clip and its transcript (`-a`/`-t`) conditions generation on that voice.

```py
import random
import re
from argparse import ArgumentParser

import torch
import torchaudio
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Config,
    ExLlamaV2Tokenizer,
    Timer,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2DynamicJob,
    ExLlamaV2Sampler,
)
from rich import print
from torchaudio import functional as F
from xcodec2.modeling_xcodec2 import XCodec2Model

parser = ArgumentParser()
parser.add_argument("-m", "--model", required=True)
parser.add_argument("-v", "--vocoder", required=True)
parser.add_argument("-i", "--input", required=True)
parser.add_argument("-a", "--audio", default="")
parser.add_argument("-t", "--transcript", default="")
parser.add_argument("-o", "--output", default="output.wav")
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--max_seq_len", type=int, default=2048)
parser.add_argument("--sample_rate", type=int, default=16000)
parser.add_argument("--seed", type=int, default=None)
parser.add_argument("--temperature", type=float, default=0.8)
parser.add_argument("--top_p", type=float, default=1.0)
args = parser.parse_args()

# Load the model with ExLlamaV2 and set up the dynamic generator.
with Timer() as timer:
    config = ExLlamaV2Config(args.model)
    config.max_seq_len = args.max_seq_len
    model = ExLlamaV2(config, lazy_load=True)
    cache = ExLlamaV2Cache(model, lazy=True)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)
    generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer)

print(f"Loaded model in {timer.interval:.2f} seconds.")

# Load the XCodec2 vocoder used to convert between speech tokens and audio.
with Timer() as timer:
    vocoder = XCodec2Model.from_pretrained(args.vocoder)
    vocoder = vocoder.cuda().eval()

print(f"Loaded vocoder in {timer.interval:.2f} seconds.")

# Optional voice cloning: load the reference clip, downmix and resample it,
# then encode it to speech tokens so it can be prepended to the prompt.
if args.audio and args.transcript:
    with Timer() as timer:
        transcript = f"{args.transcript} "
        audio, sample_rate = torchaudio.load(args.audio)
        audio = audio.cuda()

        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        if sample_rate != args.sample_rate:
            audio = F.resample(audio, sample_rate, args.sample_rate)

    print(f"Loaded audio in {timer.interval:.2f} seconds.")

    with Timer() as timer:
        audio = vocoder.encode_code(audio)
        audio = audio[0, 0, :]
        audio = [f"<|s_{a}|>" for a in audio]
        audio = "".join(audio)

    print(f"Encoded audio in {timer.interval:.2f} seconds.")
else:
    transcript = ""
    audio = ""

# Build the chat-style prompt and tokenize it.
with Timer() as timer:
    input = (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "Convert the text to speech:"
        "<|TEXT_UNDERSTANDING_START|>"
        f"{transcript}{args.input}"
        "<|TEXT_UNDERSTANDING_END|>"
        "<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        "<|SPEECH_GENERATION_START|>"
        f"{audio}"
    )

    input_ids = tokenizer.encode(input, add_bos=True, encode_special_tokens=True)

print(f"Encoded input in {timer.interval:.2f} seconds.")

# Generate speech tokens until <|SPEECH_GENERATION_END|> or the context fills up.
with Timer() as timer:
    max_new_tokens = config.max_seq_len - input_ids.shape[-1]
    gen_settings = ExLlamaV2Sampler.Settings()
    gen_settings.temperature = args.temperature
    gen_settings.top_p = args.top_p
    seed = args.seed if args.seed is not None else random.randint(0, 2**64 - 1)
    stop_conditions = ["<|SPEECH_GENERATION_END|>"]

    job = ExLlamaV2DynamicJob(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        gen_settings=gen_settings,
        seed=seed,
        stop_conditions=stop_conditions,
        decode_special_tokens=True,
    )

    generator.enqueue(job)
    output = []

    while generator.num_remaining_jobs():
        for result in generator.iterate():
            if result.get("stage") == "streaming":
                text = result.get("text", "")
                output.append(text)

                if args.debug:
                    print(text, end="", flush=True)

            if result.get("eos"):
                generator.clear_queue()

if args.debug:
    print()

print(
    f"Generated {len(output)} tokens with seed {seed} in {timer.interval:.2f} seconds."
)

# Extract the generated <|s_N|> codes and decode them back to a waveform.
with Timer() as timer:
    output = "".join(output)
    output = [int(o) for o in re.findall(r"<\|s_(\d+)\|>", output)]
    output = torch.tensor([[output]]).cuda()
    output = vocoder.decode_code(output)
    output = output[0, 0, :]
    output = output.unsqueeze(0).cpu()
    torchaudio.save(args.output, output, args.sample_rate)

print(f"Decoded audio in {timer.interval:.2f} seconds.")
```
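As a rough usage sketch, the script can be invoked along these lines. The filename `infer.py` and all paths below are placeholders, not files shipped with this repository: `-m` should point at the quantized model directory and `-v` at a local XCodec2 checkpoint.

```sh
# Plain text-to-speech (placeholder paths).
python infer.py -m ./Llasa-3B-exl2 -v ./xcodec2 -i "Hello there." -o output.wav

# Optional voice cloning: also pass a reference clip and its transcript.
python infer.py -m ./Llasa-3B-exl2 -v ./xcodec2 -i "Hello there." \
    -a reference.wav -t "Transcript of the reference clip." -o cloned.wav
```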