Abhinav Kulkarni committed on
Commit
a4f281e
1 Parent(s): ac682a3

Updated README

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -41,6 +41,7 @@ git clone https://github.com/mit-han-lab/llm-awq \
41
  ```
42
 
43
  ```python
 
44
  import torch
45
  from awq.quantize.quantizer import real_quantize_model_weight
46
  from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
@@ -82,6 +83,7 @@ prompt = f'''What is the difference between nuclear fusion and fission?
82
  ###Response:'''
83
 
84
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
 
85
  output = model.generate(
86
  inputs=input_ids,
87
  temperature=0.7,
@@ -91,6 +93,9 @@ output = model.generate(
91
  repetition_penalty=1.1,
92
  eos_token_id=tokenizer.eos_token_id,
93
  streamer=streamer)
 
 
 
94
  ```
95
 
96
  ## Evaluation
 
41
  ```
42
 
43
  ```python
44
+ import time
45
  import torch
46
  from awq.quantize.quantizer import real_quantize_model_weight
47
  from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextStreamer
 
83
  ###Response:'''
84
 
85
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
86
+ t1 = time.time()
87
  output = model.generate(
88
  inputs=input_ids,
89
  temperature=0.7,
 
93
  repetition_penalty=1.1,
94
  eos_token_id=tokenizer.eos_token_id,
95
  streamer=streamer)
96
+ t2 = time.time()
97
+ print("*"*80)
98
+ print(f"Generated {num_tokens/(t2-t1):.2f} token/s; {(t2-t1)*1000/num_tokens:.2f} ms/token")
99
  ```
100
 
101
  ## Evaluation