---
tags:
- npu
- amd
- llama3
---
This model has been quantized with AWQ and converted to run on the NPU found in Ryzen AI PCs (for example, the Ryzen 9 7940HS processor) under Windows.
For information on setting up Ryzen AI for LLMs on Windows 11, see [Running LLM on AMD NPU Hardware](https://www.hackster.io/gharada2013/running-llm-on-amd-npu-hardware-19322f).
The sample below assumes that the setup described on that page has been completed.
### Setup
Run the following in a cmd window:
```
conda activate ryzenai-transformers
<your_install_path>\RyzenAI-SW\example\transformers\setup.bat

git lfs install
git clone https://huggingface.co/dahara1/llama3-8b-amd-npu
cd llama3-8b-amd-npu
git lfs pull
cd ..
copy <your_install_path>\RyzenAI-SW\example\transformers\models\llama2\modeling_llama_amd.py .

REM Set up the runtime. See https://ryzenai.docs.amd.com/en/latest/runtime_setup.html
set XLNX_VART_FIRMWARE=<your_install_path>\voe-4.0-win_amd64\1x4.xclbin
set NUM_OF_DPU_RUNNERS=1

REM Save the sample script below as llama3-test.py (UTF-8 encoding)
python llama3-test.py
```
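Before running the full model, a quick environment check can save time. The short script below is a hypothetical helper (not part of the official setup); it only verifies that the firmware environment variable is set in the current cmd session and that the `qlinear` module from the RyzenAI-SW transformers example is importable:
```
# check_env.py -- hypothetical sanity check, not part of the official setup
import os

# setup.bat and the "set" commands above must have run in this same cmd session
firmware = os.environ.get("XLNX_VART_FIRMWARE")
print("XLNX_VART_FIRMWARE:", firmware or "NOT SET")

# qlinear is provided by the RyzenAI-SW transformers example; if this import
# fails, re-run setup.bat from <your_install_path>\RyzenAI-SW\example\transformers
import qlinear
print("qlinear imported OK:", qlinear.__file__)
```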
### Sample Script
```
import torch
import psutil
import transformers
from transformers import AutoTokenizer, set_seed
import qlinear
import logging

set_seed(123)
transformers.logging.set_verbosity_error()
logging.disable(logging.CRITICAL)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
]

message_list = [
    "Who are you? ",
    # Japanese: "What is the name of the ship you are on? Please reply using only Japanese, not English."
    "あなたの乗っている船の名前は何ですか?英語ではなく全て日本語だけを使って返事をしてください",
    # Chinese: "What is the most dangerous adventure you have experienced? Please answer everything in Chinese, not English."
    "你经历过的最危险的冒险是什么?请用中文回答所有问题,不要用英文。",
    # French: "How fast does your boat go? Please answer only in French, not in English."
    "À quelle vitesse va votre bateau ? Veuillez répondre uniquement en français et non en anglais.",
    # Korean: "What part of the ship do you like? Please answer entirely in Korean, without using English."
    "당신은 그 배의 어디를 좋아합니까? 영어를 사용하지 않고 모두 한국어로 대답하십시오.",
    # German: "What would your ship's name be in German? Please answer in German instead of English."
    "Wie würde Ihr Schiffsname auf Deutsch lauten? Bitte antwortet alle auf Deutsch statt auf Englisch.",
    # Taiwanese: "What is the most amazing treasure you have discovered? Please answer only in Taiwanese and Traditional Chinese, not English."
    "您發現過的最令人驚奇的寶藏是什麼?請僅使用台語和繁體中文回答,不要使用英文。",
]

if __name__ == "__main__":
    # Pin the process to four CPU cores; the heavy matmuls run on the NPU.
    p = psutil.Process()
    p.cpu_affinity([0, 1, 2, 3])
    torch.set_num_threads(4)

    tokenizer = AutoTokenizer.from_pretrained("llama3-8b-amd-npu")
    ckpt = "llama3-8b-amd-npu/pytorch_llama3_8b_w_bit_4_awq_lm_amd.pt"
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    # Unpickling the checkpoint requires the modeling_llama_amd.py copied during setup.
    model = torch.load(ckpt)
    model.eval()
    model = model.to(torch.bfloat16)

    # Move the quantized linear layers to the NPU ("aie" device).
    for n, m in model.named_modules():
        if isinstance(m, qlinear.QLinearPerGrp):
            print(f"Preparing weights of layer : {n}")
            m.device = "aie"
            m.quantize_weights()

    print("system: " + messages[0]['content'])
    for i in range(len(message_list)):
        messages.append({"role": "user", "content": message_list[i]})
        print("user: " + message_list[i])

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True
        )

        outputs = model.generate(inputs['input_ids'],
                                 max_new_tokens=600,
                                 eos_token_id=terminators,
                                 attention_mask=inputs['attention_mask'],
                                 do_sample=True,
                                 temperature=0.6,
                                 top_p=0.9)
        response = outputs[0][inputs['input_ids'].shape[-1]:]
        response_message = tokenizer.decode(response, skip_special_tokens=True)
        print("assistant: " + response_message)
        # Keep the reply in the chat history as an assistant turn.
        messages.append({"role": "assistant", "content": response_message})
```
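The script above prints each response only after generation completes. If you would rather watch tokens appear as they are produced, the standard `TextStreamer` from `transformers` can be passed to `generate()`. The snippet below is a sketch of a drop-in replacement for the `generate` call in the script; `TextStreamer` itself is standard transformers API, but this combination has not been verified on the NPU execution path here:
```
from transformers import TextStreamer

# Sketch: stream tokens to stdout as they are generated (untested on the NPU path).
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

outputs = model.generate(inputs['input_ids'],
                         max_new_tokens=600,
                         eos_token_id=terminators,
                         attention_mask=inputs['attention_mask'],
                         do_sample=True,
                         temperature=0.6,
                         top_p=0.9,
                         streamer=streamer)
```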