In [1]:
# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp310-cp310-linux_x86_64.whl

In [2]:
from awq import AutoAWQForCausalLM
from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM
import torch

model_path = 'mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3'

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)

In [4]:
!rm -rf test

In [5]:
model.save_pretrained('./test', safe_serialization = False)

In [6]:
model = AutoAWQForCausalLM.from_pretrained('./test')

In [7]:
quant_path = 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')

AWQ: 100%|██████████| 22/22 [02:25<00:00, 6.59s/it]


In [8]:
model.save_quantized(quant_path, safetensors = False)
tokenizer.save_pretrained(quant_path)



('malaysian-tinyllama-1.1b-16k-instructions-v3-awq/tokenizer_config.json',
 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq/special_tokens_map.json',
 'malaysian-tinyllama-1.1b-16k-instructions-v3-awq/tokenizer.json')

In [9]:
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/909193caec2d94a495535d9033f5dcd975686356', commit_message='Upload tokenizer', commit_description='', oid='909193caec2d94a495535d9033f5dcd975686356', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
quantization_config = AwqConfig(
 bits=quant_config['w_bit'],
 group_size=quant_config['q_group_size'],
 zero_point=quant_config['zero_point'],
 backend='autoawq',
 version=quant_config['version'].lower(),
)

config = AutoConfig.from_pretrained(model_path)
config.quantization_config = quantization_config

config.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ')

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3-AWQ/commit/a259a50f290eb1e648396698d9a0dcac7d33d5a2', commit_message='Upload config', commit_description='', oid='a259a50f290eb1e648396698d9a0dcac7d33d5a2', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
!ls malaysian-tinyllama-1.1b-16k-instructions-v3-awq

config.json		quant_config.json	 tokenizer_config.json
generation_config.json	special_tokens_map.json
pytorch_model.bin	tokenizer.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
from huggingface_hub import HfApi

api = HfApi()

In [13]:
%%time

generate_kwargs = dict(
 inputs,
 max_new_tokens=1024,
 top_p=0.95,
 top_k=50,
 temperature=0.9,
 do_sample=True,
 num_beams=1,
)
r = quantized_model.generate(**generate_kwargs)
tokenizer.decode(r[0])

pytorch_model.bin: 0%| | 0.00/766M [00:00 [INST] KWSP tu apa [/INST]KWSP, singkatan untuk Kumpulan Wang Simpanan Pekerja, ialah Kumpulan Wang Simpanan Pekerja di Malaysia, yang merupakan dana simpanan pekerja. KWSP bertujuan untuk menyediakan simpanan persaraan dan faedah keselamatan sosial untuk ahli KWSP (dana caruman dan majikan) dengan menyediakan pelaburan boleh beli. Dana ini diuruskan oleh KWSP, sebuah syarikat milik kerajaan di Malaysia. KWSP terdiri daripada simpanan caruman pekerja dan simpanan majikan, dengan peruntukan yang berkaitan dengan skim simpanan yang berbeza di Malaysia. Dana ini menggunakan pelaburan yang dibuat oleh pencarum dan pembayar caruman untuk menjana dividen dan faedah, yang seterusnya menyokong matlamat kerajaan untuk menyediakan simpanan persaraan yang selamat dan mencukupi untuk pekerja.'