Abhinav Kulkarni
committed on
Commit
•
54d79ef
1
Parent(s):
9feafc8
Updated README
Browse files
README.md
CHANGED
@@ -44,6 +44,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
|
|
44 |
```
|
45 |
|
46 |
```python
|
|
|
47 |
import torch
|
48 |
from awq.quantize.quantizer import real_quantize_model_weight
|
49 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
@@ -85,6 +86,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
|
|
85 |
###Response:'''
|
86 |
|
87 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
|
|
88 |
output = model.generate(
|
89 |
inputs=input_ids,
|
90 |
temperature=0.7,
|
@@ -94,6 +96,9 @@ output = model.generate(
|
|
94 |
repetition_penalty=1.1,
|
95 |
eos_token_id=tokenizer.eos_token_id,
|
96 |
streamer=streamer)
|
|
|
|
|
|
|
97 |
```
|
98 |
|
99 |
## Evaluation
|
|
|
44 |
```
|
45 |
|
46 |
```python
|
47 |
+
import time
|
48 |
import torch
|
49 |
from awq.quantize.quantizer import real_quantize_model_weight
|
50 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
|
|
86 |
###Response:'''
|
87 |
|
88 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
89 |
+
t1 = time.time()
|
90 |
output = model.generate(
|
91 |
inputs=input_ids,
|
92 |
temperature=0.7,
|
|
|
96 |
repetition_penalty=1.1,
|
97 |
eos_token_id=tokenizer.eos_token_id,
|
98 |
streamer=streamer)
|
99 |
+
t2 = time.time()
|
100 |
+
print("*"*80)
|
101 |
+
print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
|
102 |
```
|
103 |
|
104 |
## Evaluation
|