gunjanjoshi committed
Commit 347544f · Parent(s): 69a54ae

Added accelerate

Files changed (1):
  app.py +4 -3
app.py CHANGED

@@ -6,6 +6,7 @@ import gradio as gr
 # Loading PEFT model
 PEFT_MODEL = "gunjanjoshi/llama2-7b-sharded-bf16-finetuned-mental-health-conversational"
 
+# Modify BitsAndBytesConfig for CPU
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -18,7 +19,7 @@ peft_base_model = AutoModelForCausalLM.from_pretrained(
     config.base_model_name_or_path,
     return_dict=True,
     quantization_config=bnb_config,
-    device_map="auto",
+    device_map="cpu",  # Ensure this is set to CPU
     trust_remote_code=True,
 )
 
@@ -35,9 +36,9 @@ system_message = """You are a helpful and and truthful psychology and psychother
 
 def generate_response(user_input):
     formatted = f"<s>[INST] <<SYS>>{system_message}<</SYS>>{user_input} [/INST]"
-    input_ids = peft_tokenizer(formatted, return_tensors="pt", truncation=True, max_length=1024).input_ids.cuda()
+    input_ids = peft_tokenizer(formatted, return_tensors="pt", truncation=True, max_length=1024).input_ids
     outputs = peft_model.generate(input_ids=input_ids, do_sample=True, top_p=0.9, temperature=0.95, max_length=2048)
-    translated_output = peft_tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(formatted)-1:]
+    translated_output = peft_tokenizer.batch_decode(outputs.detach().numpy(), skip_special_tokens=True)[0][len(formatted)-1:]
     return translated_output
 
 with gr.Blocks() as demo:
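
For reference, below is a minimal, self-contained sketch of the CPU-only inference path this commit converges on. The PeftConfig/PeftModel plumbing, the tokenizer setup, and the torch_dtype choice are assumptions reconstructed from the hunk context rather than lines shown in this diff, and system_message is a placeholder because the diff truncates it. One further caveat, also an assumption worth verifying: bitsandbytes 4-bit loading has historically required a CUDA device, so a CPU-only Space may need to drop quantization_config entirely, as this sketch does.

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

PEFT_MODEL = "gunjanjoshi/llama2-7b-sharded-bf16-finetuned-mental-health-conversational"
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Load the base model on CPU. quantization_config is omitted on the
# assumption that bitsandbytes 4-bit loading is unavailable without CUDA;
# float32 keeps everything on the plain CPU path.
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    device_map="cpu",
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Placeholder: the real prompt is truncated in the diff above.
system_message = "You are a helpful and truthful psychology and psychotherapy assistant."

def generate_response(user_input):
    formatted = f"<s>[INST] <<SYS>>{system_message}<</SYS>>{user_input} [/INST]"
    # No .cuda() call: tensors stay on the CPU, matching device_map above.
    input_ids = peft_tokenizer(
        formatted, return_tensors="pt", truncation=True, max_length=1024
    ).input_ids
    outputs = peft_model.generate(
        input_ids=input_ids, do_sample=True, top_p=0.9,
        temperature=0.95, max_length=2048,
    )
    # batch_decode accepts tensors directly, so no .cpu()/.numpy() hop is needed.
    return peft_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(formatted) - 1:]

Generation for a 7B model on CPU will be slow, but this exercises the same tokenize/generate/decode path that the Gradio app wires up.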