Abhinav Kulkarni
committed on
Commit
•
7928487
1
Parent(s):
db591f3
Updated README
Browse files
README.md
CHANGED
@@ -39,6 +39,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
|
|
39 |
```
|
40 |
|
41 |
```python
|
|
|
42 |
import torch
|
43 |
from awq.quantize.quantizer import real_quantize_model_weight
|
44 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
@@ -80,6 +81,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
|
|
80 |
###Response:'''
|
81 |
|
82 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
|
|
83 |
output = model.generate(
|
84 |
inputs=input_ids,
|
85 |
temperature=0.7,
|
@@ -89,6 +91,9 @@ output = model.generate(
|
|
89 |
repetition_penalty=1.1,
|
90 |
eos_token_id=tokenizer.eos_token_id,
|
91 |
streamer=streamer)
|
|
|
|
|
|
|
92 |
```
|
93 |
|
94 |
## Evaluation
|
|
|
39 |
```
|
40 |
|
41 |
```python
|
42 |
+
import time
|
43 |
import torch
|
44 |
from awq.quantize.quantizer import real_quantize_model_weight
|
45 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
|
|
81 |
###Response:'''
|
82 |
|
83 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
84 |
+
t1 = time.time()
|
85 |
output = model.generate(
|
86 |
inputs=input_ids,
|
87 |
temperature=0.7,
|
|
|
91 |
repetition_penalty=1.1,
|
92 |
eos_token_id=tokenizer.eos_token_id,
|
93 |
streamer=streamer)
|
94 |
+
t2 = time.time()
|
95 |
+
print("*"*80)
|
96 |
+
print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
|
97 |
```
|
98 |
|
99 |
## Evaluation
|