Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -85,8 +85,8 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
|
|
85 |
attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
|
86 |
|
87 |
generate_kwargs = dict(
|
88 |
-
input_ids=input_ids
|
89 |
-
attention_mask=attention_mask
|
90 |
streamer=streamer,
|
91 |
do_sample=True,
|
92 |
temperature=temperature,
|
@@ -115,7 +115,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
115 |
model = AutoModelForCausalLM.from_pretrained(
|
116 |
MODEL_ID,
|
117 |
device_map="auto",
|
118 |
-
|
119 |
#attn_implementation="flash_attention_2",
|
120 |
)
|
121 |
|
|
|
85 |
attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
|
86 |
|
87 |
generate_kwargs = dict(
|
88 |
+
input_ids=input_ids,
|
89 |
+
attention_mask=attention_mask,
|
90 |
streamer=streamer,
|
91 |
do_sample=True,
|
92 |
temperature=temperature,
|
|
|
115 |
model = AutoModelForCausalLM.from_pretrained(
|
116 |
MODEL_ID,
|
117 |
device_map="auto",
|
118 |
+
quantization_config=quantization_config,
|
119 |
#attn_implementation="flash_attention_2",
|
120 |
)
|
121 |
|