In [1]:
# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.6/autoawq-0.1.6+cu118-cp310-cp310-linux_x86_64.whl

In [2]:
!nvidia-smi

Tue Nov  7 14:32:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000001:00:00.0 Off |                    0 |
| N/A   37C    P0    66W / 300W |   5536MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
from awq import AutoAWQForCausalLM
from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM
import torch

model_path = 'mesolitica/malaysian-llama2-7b-32k-instructions'

[2023-11-07 14:32:32,101] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# !pip3 install transformers==4.35.0

In [5]:
!rm -rf test

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
model.save_pretrained('./test', safe_serialization = False)

In [8]:
model = AutoAWQForCausalLM.from_pretrained('./test')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
quant_path = 'malaysian-llama2-7b-32k-instructions-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')

AWQ: 100%|██████████| 32/32 [08:38<00:00, 16.21s/it]


In [11]:
model.save_quantized(quant_path, safetensors = False)
tokenizer.save_pretrained(quant_path)



('malaysian-llama2-7b-32k-instructions-awq/tokenizer_config.json',
 'malaysian-llama2-7b-32k-instructions-awq/special_tokens_map.json',
 'malaysian-llama2-7b-32k-instructions-awq/tokenizer.model',
 'malaysian-llama2-7b-32k-instructions-awq/added_tokens.json',
 'malaysian-llama2-7b-32k-instructions-awq/tokenizer.json')

In [12]:
tokenizer.push_to_hub('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/commit/ea465a1be780a5091d89685d69ec7146ba0d69e4', commit_message='Upload tokenizer', commit_description='', oid='ea465a1be780a5091d89685d69ec7146ba0d69e4', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
quantization_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    backend='autoawq',
    version=quant_config['version'].lower(),
)

config = AutoConfig.from_pretrained(model_path)
config.quantization_config = quantization_config

config.push_to_hub('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/commit/69be7a3e995592db52910fe2e848e85dc2637ad3', commit_message='Upload config', commit_description='', oid='69be7a3e995592db52910fe2e848e85dc2637ad3', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
from huggingface_hub import HfApi

api = HfApi()

In [17]:
api.upload_file(
    path_or_fileobj='malaysian-llama2-7b-32k-instructions-awq/pytorch_model.bin',
    path_in_repo="pytorch_model.bin",
    repo_id='mesolitica/malaysian-llama2-7b-32k-instructions-AWQ',
    repo_type="model",
)

pytorch_model.bin:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

'https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/blob/main/pytorch_model.bin'

In [18]:
quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')
_ = quantized_model.cuda()

Downloading (…)lve/main/config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


Downloading pytorch_model.bin:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

In [21]:
def parse_llama_chat(messages):

    system = messages[0]['content']
    user_query = messages[-1]['content']

    users, assistants = [], []
    for q in messages[1:-1]:
        if q['role'] == 'user':
            users.append(q['content'])
        elif q['role'] == 'assistant':
            assistants.append(q['content'])

    texts = [f'<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n']
    for u, a in zip(users, assistants):
        texts.append(f'{u.strip()} [/INST] {a.strip()} </s><s>[INST] ')
    texts.append(f'{user_query.strip()} [/INST]')
    prompt = ''.join(texts).strip()
    return prompt

In [22]:
messages = [
    {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},
    {'role': 'user', 'content': 'kwsp tu apa'}
]
prompt = parse_llama_chat(messages)
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')

In [24]:
%%time

generate_kwargs = dict(
    inputs,
    max_new_tokens=1024,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)
r = quantized_model.generate(**generate_kwargs)
tokenizer.decode(r[0])

CPU times: user 7.54 s, sys: 3.84 ms, total: 7.54 s
Wall time: 7.54 s


'<s> [INST] <<SYS>>\nawak adalah AI yang mampu jawab segala soalan\n<</SYS>>\n\nkwsp tu apa [/INST] KWSP adalah singkatan bagi "Kumpulan Wang Simpanan Pekerja", yang merujuk kepada skim simpanan persaraan yang dilaksanakan di Malaysia yang bertujuan untuk menyediakan dana persaraan untuk pekerja dan majikan. Program ini memerlukan majikan untuk menyumbang sejumlah wang bagi pihak pekerja, dan pekerja dikehendaki menyumbang sejumlah yang sama bagi pihak mereka sendiri. Dana ini dikumpulkan dalam dana berasingan dan dikawal selia oleh kerajaan. KWSP menyediakan faedah persaraan kepada ahlinya, seperti pengeluaran, pelaburan, dan pencen. Skim ini terkenal kerana tadbir urusnya yang baik dan reputasinya sebagai salah satu dana simpanan persaraan terbesar dan paling dipercayai di Asia. </s>'