datascientist22 committed
Commit • 4ee92ad
Parent(s): eaf4e19
Update app.py

app.py CHANGED
@@ -2,16 +2,16 @@ import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-# Load the tokenizer and model for CPU
+# Load the tokenizer and model for CPU without bitsandbytes
 tokenizer = AutoTokenizer.from_pretrained("MohamedMotaz/Examination-llama-8b-4bit")
+
+# Load the model in full precision, explicitly avoiding 8-bit quantization
 model = AutoModelForCausalLM.from_pretrained(
-    "MohamedMotaz/Examination-llama-8b-4bit",
-    torch_dtype=torch.float32  #
+    "MohamedMotaz/Examination-llama-8b-4bit",
+    torch_dtype=torch.float32,  # Ensure it uses full precision (float32)
+    device_map="cpu",  # Force the model to run on the CPU
 )
 
-# Ensure the model runs on CPU
-model = model.to("cpu")
-
 # App Title
 st.title("Exam Corrector: Automated Grading with LLama 8b Model (CPU)")
 
@@ -32,8 +32,8 @@ if st.button("Grade Answer"):
     inputs = f"Model Answer: {model_answer}\n\nStudent Answer: {student_answer}\n\nResponse:"
 
     # Tokenize the inputs using PyTorch tensors
-    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
-
+    input_ids = tokenizer(inputs, return_tensors="pt").input_ids
+
     # Generate the response using the model (PyTorch, CPU-based)
     with torch.no_grad():
         outputs = model.generate(input_ids, max_length=200)
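A note on the new loading path, not part of this commit: passing device_map to from_pretrained requires the accelerate package to be installed, and holding an 8B-parameter model in float32 takes roughly 32 GB of RAM (4 bytes per parameter). A minimal sketch to sanity-check that the weights really land on the CPU in full precision; the assertions are illustrative and do not appear in app.py:

import torch
from transformers import AutoModelForCausalLM

# Illustrative sanity check, reusing the commit's loading arguments.
model = AutoModelForCausalLM.from_pretrained(
    "MohamedMotaz/Examination-llama-8b-4bit",
    torch_dtype=torch.float32,
    device_map="cpu",
)
param = next(model.parameters())
assert param.device.type == "cpu"    # weights placed on the CPU
assert param.dtype == torch.float32  # full precision, not 4-/8-bit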
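The second hunk stops at the generate call. For completeness, a minimal sketch of how the generated IDs would typically be decoded for display; the prompt-stripping and the st.write call are assumptions, not code from this diff:

# Assumed continuation, not part of this commit.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# For causal LMs, generate() returns prompt + continuation, so drop the prompt text.
response = response[len(inputs):].strip()
st.write(response)  # hypothetical display call

One design note: max_length=200 caps prompt and completion tokens combined, so a long model answer plus student answer leaves little room for the graded response; max_new_tokens is the usual way to bound only the newly generated tokens.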