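"""Audio multimodal demo: run a Qwen-style audio-chat model on a Rockchip NPU.

The pipeline encodes an audio clip with a Whisper-style RKNN audio encoder,
then feeds the resulting embeddings into an RKLLM language model through its
multimodal input path.
"""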
import os
import signal
import sys
import time

import librosa
import numpy as np
from rkllm_binding import *
from rknnlite.api.rknn_lite import RKNNLite
from transformers import WhisperFeatureExtractor


MODEL_PATH = "qwen.rkllm"
AUDIO_ENCODER_PATH = "audio_encoder.rknn"
handle = None

# Exit cleanly on Ctrl-C, aborting any in-flight RKLLM call first.
def signal_handler(sig, frame):
    print("Ctrl-C pressed, exiting...")
    global handle
    if handle:
        abort(handle)
        destroy(handle)
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)

# Equivalent to `export RKLLM_LOG_LEVEL=1`: log level 1 makes the RKLLM
# runtime print performance statistics.
os.environ["RKLLM_LOG_LEVEL"] = "1"

inference_count = 0
inference_start_time = 0

# Streaming callback invoked by the RKLLM runtime: RKLLM_RUN_NORMAL delivers
# one decoded text chunk per call; RKLLM_RUN_FINISH / RKLLM_RUN_ERROR end the run.
def result_callback(result, userdata, state):
    global inference_start_time
    global inference_count
    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if inference_count == 0:
            # The first chunk after run() marks the time to first token.
            first_token_time = time.time()
            print(f"Time to first token: {first_token_time - inference_start_time:.2f} seconds")
        inference_count += 1
        print(result.contents.text.decode(), end="", flush=True)
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        print("\n\n(finished)")
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        print("\nError occurred during LLM call")

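# Hedged extension idea: to also report decode speed, reuse inference_count in
# the RKLLM_RUN_FINISH branch, e.g.
#   elapsed = time.time() - inference_start_time
#   print(f"{inference_count} tokens in {elapsed:.2f}s ({inference_count / elapsed:.1f} tok/s)")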
# Load the Whisper feature-extractor config (preprocessor_config.json) from
# the current directory.
feature_extractor = WhisperFeatureExtractor.from_pretrained(".")

# Initialize the audio encoder on the NPU. RKNNLite calls return 0 on success.
audio_encoder = RKNNLite(verbose=True)
model_size = os.path.getsize(AUDIO_ENCODER_PATH)
print(f"Start loading audio encoder model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
if audio_encoder.load_rknn(AUDIO_ENCODER_PATH) != 0:
    raise RuntimeError(f"Failed to load {AUDIO_ENCODER_PATH}")
end_time = time.time()
print(f"Audio encoder loaded in {end_time - start_time:.2f} seconds (speed: {model_size / (end_time - start_time) / 1024 / 1024:.2f} MB/s)")
if audio_encoder.init_runtime() != 0:
    raise RuntimeError("Failed to initialize the RKNN runtime")

# Initialize RKLLM. The binding only exposes image-flavored multimodal fields,
# so the audio special tokens are passed through img_start/img_end/img_content.
param = create_default_param()
param.model_path = MODEL_PATH.encode()
param.img_start = "<|audio_bos|>".encode()
param.img_end = "<|audio_eos|>".encode()
param.img_content = "<|AUDIO|>".encode()
param.max_context_len = 1024
extend_param = RKLLMExtendParam()
# Put the LLM in IOMMU domain 1 so it does not contend with the audio
# encoder, which runs in domain 0.
extend_param.base_domain_id = 1
param.extend_param = extend_param
model_size = os.path.getsize(MODEL_PATH)
print(f"Start loading language model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
handle = init(param, result_callback)
end_time = time.time()
print(f"Language model loaded in {end_time - start_time:.2f} seconds (speed: {model_size / (end_time - start_time) / 1024 / 1024:.2f} MB/s)")


# Audio clip to analyze.
audio_path = "glass-breaking.mp3"


print("Start inference...")
# Resample to the extractor's expected rate and pad to Whisper's fixed
# 30-second window so the encoder always receives a fixed-shape mel input.
audio, _ = librosa.load(audio_path, sr=feature_extractor.sampling_rate)
feature_extractor_output = feature_extractor(
    audio,
    sampling_rate=feature_extractor.sampling_rate,
    return_attention_mask=True,
    padding="max_length",
)

print(feature_extractor_output.input_features.shape)
start_time = time.time()
# The RKNN graph takes the mel features plus the attention mask (both cast to
# float32); its first output is the audio embedding sequence.
audio_embeddings = audio_encoder.inference(inputs=[
    feature_extractor_output.input_features.astype(np.float32),
    feature_extractor_output.attention_mask.astype(np.float32),
], data_format="nhwc")[0].astype(np.float32)
end_time = time.time()
print(f"Audio encoder inference time: {end_time - start_time:.2f} seconds")
# Print memory-layout flags: the buffer handed to the C API should be C-contiguous.
print(audio_embeddings.flags)
print(audio_embeddings.shape)
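# Defensive copy, a no-op when the output is already contiguous; the
# commented-out embedding path below does the same with np.ascontiguousarray.
audio_embeddings = np.ascontiguousarray(audio_embeddings, dtype=np.float32)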

# Build the chat prompt. RKLLM hardcodes <image> as its embedding placeholder,
# so the tag stands in for the audio embedding here too. The user turn asks,
# in Chinese: "What is this sound?"
prompt = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Audio 1: <image>
这是什么声音? <|im_end|>
<|im_start|>assistant
"""

# Alternative path: feed precomputed embeddings directly via RKLLM_INPUT_EMBED,
# bypassing the prompt template:
# image_embeddings = np.load("image_embeddings_pth_orig.npy")
# image_embeddings = np.ascontiguousarray(image_embeddings, dtype=np.float32)
# rkllm_input = create_rkllm_input(RKLLMInputType.RKLLM_INPUT_EMBED, embed=image_embeddings)

# Multimodal input: the prompt plus the audio embeddings that replace <image>.
rkllm_input = create_rkllm_input(RKLLMInputType.RKLLM_INPUT_MULTIMODAL, prompt=prompt, image_embed=audio_embeddings)

# Inference parameters: plain autoregressive generation.
infer_param = RKLLMInferParam()
infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value

# Run generation; run() drives result_callback until RKLLM_RUN_FINISH.
inference_start_time = time.time()
run(handle, rkllm_input, infer_param, None)

# Clean up
destroy(handle)
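# Note: abort() (used in the SIGINT handler) is only needed to cancel an
# in-flight generation; a run that completes normally just needs destroy()
# to release the model's NPU resources.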