TheBloke committed
Commit df10d83
1 Parent(s): 6b5a365

Update README.md

Files changed (1):
  1. README.md +53 -14
README.md CHANGED
@@ -34,6 +34,14 @@ It is the result of quantising to 4bit using [AutoGPTQ](https://github.com/PanQi
 * [2, 3, 4, 5, 6, 8-bit GGML models for CPU+GPU inference](https://huggingface.co/TheBloke/falcon-40b-instruct-GGML)
 * [Unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/tiiuae/falcon-40b-instruct)
 
 ## EXPERIMENTAL
 
 Please note this is an experimental GPTQ model. Support for it is currently quite limited.
@@ -97,24 +105,57 @@ pip install einops
 
 You can then run this example code:
 ```python
-import torch
-from transformers import AutoTokenizer
-from auto_gptq import AutoGPTQForCausalLM
 
-# Download the model from HF and store it locally, then reference its location here:
-quantized_model_dir = "/path/to/falcon40b-instruct-GPTQ"
 
-from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=False)
 
-model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False, use_safetensors=True, torch_dtype=torch.float32, trust_remote_code=True)
 
-prompt = "Write a story about llamas"
-prompt_template = f"### Instruction: {prompt}\n### Response:"
 
-tokens = tokenizer(prompt_template, return_tensors="pt").to("cuda:0").input_ids
-output = model.generate(input_ids=tokens, max_new_tokens=100, do_sample=True, temperature=0.8)
 print(tokenizer.decode(output[0]))
 ```
 
 ## Provided files
@@ -204,8 +245,6 @@ for seq in sequences:
 
 ```
 
-
-
 # Model Card for Falcon-40B-Instruct
 
 ## Model Details
 
 * [2, 3, 4, 5, 6, 8-bit GGML models for CPU+GPU inference](https://huggingface.co/TheBloke/falcon-40b-instruct-GGML)
 * [Unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/tiiuae/falcon-40b-instruct)
 
+## Prompt template
+
+```
+A helpful assistant who helps the user with any questions asked.
+User: prompt
+Assistant:
+```
+
 ## EXPERIMENTAL
 
 Please note this is an experimental GPTQ model. Support for it is currently quite limited.
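The prompt template added above is plain text with `User:` / `Assistant:` turns. As a quick illustration of how it gets filled in before tokenisation, here is a minimal sketch (not part of the commit; the `build_prompt` helper name is made up):

```python
# Hypothetical helper showing how the template above is filled in for one turn.
def build_prompt(user_message: str) -> str:
    return (
        "A helpful assistant who helps the user with any questions asked.\n"
        f"User: {user_message}\n"
        "Assistant:"
    )

print(build_prompt("Tell me about AI"))
```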
 
 You can then run this example code:
 ```python
+from transformers import AutoTokenizer, pipeline, logging
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+import argparse
 
+model_name_or_path = "TheBloke/falcon-40b-instruct-GPTQ"
+# You could also download the model locally, and access it there
+# model_name_or_path = "/path/to/TheBloke_falcon-40b-instruct-GPTQ"
 
+model_basename = "gptq_model-4bit--1g"
 
+use_triton = False
 
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 
+model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
+        model_basename=model_basename,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device="cuda:0",
+        use_triton=use_triton,
+        quantize_config=None)
+
+prompt = "Tell me about AI"
+prompt_template=f'''A helpful assistant who helps the user with any questions asked.
+User: {prompt}
+Assistant:'''
+
+print("\n\n*** Generate:")
+
+input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
 print(tokenizer.decode(output[0]))
+
+# Inference can also be done using transformers' pipeline
+# Note that if you use pipeline, you will see a spurious error message saying the model type is not supported
+# This can be ignored! Or you can hide it with the following logging line:
+# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
+logging.set_verbosity(logging.CRITICAL)
+
+print("*** Pipeline:")
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.95,
+    repetition_penalty=1.15
+)
+
+print(pipe(prompt_template)[0]['generated_text'])
 ```
 
 ## Provided files
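The new example loads `TheBloke/falcon-40b-instruct-GPTQ` straight from the Hub, with a commented-out `model_name_or_path` showing the local-copy alternative. A minimal sketch of fetching such a local copy with `huggingface_hub` (not part of the commit; assumes `huggingface_hub` is installed, and the files land in the library's cache unless a specific target directory is requested):

```python
# Sketch only: download the repo (including the quantised .safetensors file named
# by model_basename) and use the returned directory as model_name_or_path above.
from huggingface_hub import snapshot_download

local_path = snapshot_download("TheBloke/falcon-40b-instruct-GPTQ")
print("Model files are in:", local_path)
```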
 
 ```
 
 # Model Card for Falcon-40B-Instruct
 
 ## Model Details
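One optional refinement to the `model.generate` example in the diff above (not part of the commit): `tokenizer.decode(output[0])` prints the prompt followed by the completion, so slicing off the prompt tokens gives just the response:

```python
# Follow-on to the generate example above; assumes tokenizer, input_ids and output
# from that snippet. Decodes only the newly generated tokens.
response = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
print(response)
```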