Abhinav Kulkarni
committed on
Commit
•
a4f281e
1
Parent(s):
ac682a3
Updated README
Browse files
README.md
CHANGED
@@ -41,6 +41,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
|
|
41 |
```
|
42 |
|
43 |
```python
|
|
|
44 |
import torch
|
45 |
from awq.quantize.quantizer import real_quantize_model_weight
|
46 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
@@ -82,6 +83,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
|
|
82 |
###Response:'''
|
83 |
|
84 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
|
|
85 |
output = model.generate(
|
86 |
inputs=input_ids,
|
87 |
temperature=0.7,
|
@@ -91,6 +93,9 @@ output = model.generate(
|
|
91 |
repetition_penalty=1.1,
|
92 |
eos_token_id=tokenizer.eos_token_id,
|
93 |
streamer=streamer)
|
|
|
|
|
|
|
94 |
```
|
95 |
|
96 |
## Evaluation
|
|
|
41 |
```
|
42 |
|
43 |
```python
|
44 |
+
import time
|
45 |
import torch
|
46 |
from awq.quantize.quantizer import real_quantize_model_weight
|
47 |
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
|
|
|
83 |
###Response:'''
|
84 |
|
85 |
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
|
86 |
+
t1 = time.time()
|
87 |
output = model.generate(
|
88 |
inputs=input_ids,
|
89 |
temperature=0.7,
|
|
|
93 |
repetition_penalty=1.1,
|
94 |
eos_token_id=tokenizer.eos_token_id,
|
95 |
streamer=streamer)
|
96 |
+
t2 = time.time()
|
97 |
+
print("*"*80)
|
98 |
+
print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
|
99 |
```
|
100 |
|
101 |
## Evaluation
|