gunjanjoshi committed
Commit 347544f · Parent(s): 69a54ae

Added accelerate

Files changed (1):
  app.py +4 -3
app.py CHANGED

@@ -6,6 +6,7 @@ import gradio as gr
 # Loading PEFT model
 PEFT_MODEL = "gunjanjoshi/llama2-7b-sharded-bf16-finetuned-mental-health-conversational"
 
+# Modify BitsAndBytesConfig for CPU
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -18,7 +19,7 @@ peft_base_model = AutoModelForCausalLM.from_pretrained(
     config.base_model_name_or_path,
     return_dict=True,
     quantization_config=bnb_config,
-    device_map="auto",
+    device_map="cpu",  # Ensure this is set to CPU
     trust_remote_code=True,
 )
 
@@ -35,9 +36,9 @@ system_message = """You are a helpful and and truthful psychology and psychother
 
 def generate_response(user_input):
     formatted = f"<s>[INST] <<SYS>>{system_message}<</SYS>>{user_input} [/INST]"
-    input_ids = peft_tokenizer(formatted, return_tensors="pt", truncation=True, max_length=1024).input_ids.cuda()
+    input_ids = peft_tokenizer(formatted, return_tensors="pt", truncation=True, max_length=1024).input_ids
     outputs = peft_model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, temperature=0.95, max_length=2048)
-    translated_output = peft_tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(formatted)-1:]
+    translated_output = peft_tokenizer.batch_decode(outputs.detach().numpy(), skip_special_tokens=True)[0][len(formatted)-1:]
     return translated_output
 
 with gr.Blocks() as demo:
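
For reference, below is a minimal, self-contained sketch of the CPU-only inference path this commit converges on. The PeftConfig/PeftModel plumbing, the tokenizer setup, and the torch_dtype choice are assumptions reconstructed from the hunk context rather than lines shown in this diff, and system_message is a placeholder because the diff truncates it. One further caveat, also an assumption worth verifying: bitsandbytes 4-bit loading has historically required a CUDA device, so a CPU-only Space may need to drop quantization_config entirely, as this sketch does.

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

PEFT_MODEL = "gunjanjoshi/llama2-7b-sharded-bf16-finetuned-mental-health-conversational"
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Load the base model on CPU. quantization_config is omitted on the
# assumption that bitsandbytes 4-bit loading is unavailable without CUDA;
# float32 keeps everything on the plain CPU path.
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    device_map="cpu",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Placeholder: the real prompt is truncated in the diff above.
system_message = "You are a helpful and truthful psychology and psychotherapy assistant."

def generate_response(user_input):
    formatted = f"<s>[INST] <<SYS>>{system_message}<</SYS>>{user_input} [/INST]"
    # No .cuda() call: tensors stay on the CPU, matching device_map above.
    input_ids = peft_tokenizer(
        formatted, return_tensors="pt", truncation=True, max_length=1024
    ).input_ids
    outputs = peft_model.generate(
        input_ids=input_ids, do_sample=True, top_p=0.9,
        temperature=0.95, max_length=2048,
    )
    # batch_decode accepts tensors directly, so no .cpu()/.numpy() hop is needed.
    return peft_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(formatted) - 1:]

Generation for a 7B model on CPU will be slow, but this exercises the same tokenize/generate/decode path that the Gradio app wires up.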