ignacio committed on
Commit 6830e68
1 Parent(s): d6077fd

v1 solo api

Files changed (3)
  1. app.py +48 -27
  2. config.py +16 -6
  3. requirements.txt +2 -1
app.py CHANGED
@@ -4,8 +4,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import pandas as pd
 from datetime import datetime, timedelta, timezone
 import torch
-from config import hugging_face_token, init_google_sheets_client, models, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
+from config import hugging_face_token, replicate_token, init_google_sheets_client, models, replicate_model, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
 import spaces
+import replicate
 
 # Hack for ZeroGPU
 torch.jit.script = lambda f: f
@@ -39,14 +40,18 @@ tokenizer, model = None, None
 # Initialize the data list
 data = []
 
+# Initialize the Replicate client
+replicate_api = replicate.Client(api_token=replicate_token)
+
 # Load the model and tokenizer once at the beginning
 def load_model(model_name):
-    global tokenizer, model, selected_model
+    global tokenizer, selected_model  # model is no longer loaded locally
     try:
         # Release the memory of the previous model if exists
-        if model is not None:
-            del model
-            torch.cuda.empty_cache()
+        # Not applicable for the API version
+        #if model is not None:
+        #    del model
+        #    torch.cuda.empty_cache()
 
         # Check if the model is in models or quantized_models and load accordingly
         if model_name in models:
@@ -68,44 +73,45 @@ def load_model(model_name):
             tokenizer.pad_token = tokenizer.eos_token
             tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
 
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            token=hugging_face_token,
-            trust_remote_code=True
-        )
-
-        # Only move to CUDA if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to("cuda")
-
+        #model = AutoModelForCausalLM.from_pretrained(
+        #    model_path,
+        #    token=hugging_face_token,
+        #    trust_remote_code=True
+        #)
+        #
+        ## Only move to CUDA if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to("cuda")
+        #
         selected_model = model_name
     except Exception as e:
         print(f"Error loading model {model_name}: {e}")
         raise e
-    return tokenizer, model
+    return tokenizer  #, model
 
 
 
 # Ensure the initial model is loaded
-tokenizer, model = load_model(selected_model)
+# Now only the tokenizer is loaded
+tokenizer = load_model(selected_model)  #, model
 
 # Chat history
 chat_history = []
 
 # Function to handle interaction with model
-@spaces.GPU
+#@spaces.GPU
 def interact(user_input, history, interaction_count, model_name):
     global tokenizer, model
     try:
         if tokenizer is None or model is None:
             raise ValueError("Tokenizer or model is not initialized.")
 
-        # Determine the device to use (either CUDA if available, or CPU)
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        # Only move the model to the device if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to(device)
+        ## Determine the device to use (either CUDA if available, or CPU)
+        #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        #
+        ## Only move the model to the device if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to(device)
 
         if interaction_count >= MAX_INTERACTIONS:
             user_input += ". Thank you for your questions. Our session is now over. Goodbye!"
@@ -120,9 +126,24 @@ def interact(user_input, history, interaction_count, model_name):
         prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
         # Move input tensor to the correct device
-        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
-        chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
-        response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
+        #input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
+
+        # Build the generation request
+        inpt = {"prompt": prompt,
+                "max_new_tokens": 100,
+                "temperature": 0.1,
+                "prompt_template": "{prompt}"}
+                #"num_return_sequences": 1,
+                #"pad_token_id": tokenizer.eos_token_id}
+
+        # Make the request to the Replicate API
+        response = replicate_api.run(
+            replicate_model[model_name],
+            input=inpt
+        )
+        response = "".join(response).strip()
+        #chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
+        #response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
 
         # Update chat history with generated response
         history.append({"role": "user", "content": user_input})
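
What the app.py change amounts to: interact() no longer calls model.generate() on a locally loaded model; it still renders the chat template with the local tokenizer, then sends the prompt to Replicate and joins the streamed chunks. A minimal sketch of that call path outside the Gradio app (the Replicate slug and the environment-variable names are assumptions for illustration; the app itself resolves the slug through replicate_model in config.py):

    # Sketch only: mirrors the new interact() path under the assumptions stated above.
    import os
    import replicate
    from transformers import AutoTokenizer

    replicate_api = replicate.Client(api_token=os.getenv("REPLICATE_TOKEN"))
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",   # tokenizer repo from config.py (gated model, needs an HF token)
        token=os.getenv("HUGGING_FACE_TOKEN"),
    )

    messages = [{"role": "user", "content": "Tell me a short story about a lighthouse."}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    output = replicate_api.run(
        "meta/meta-llama-3-8b-instruct",         # assumed Replicate slug; the app looks this up in replicate_model
        input={
            "prompt": prompt,
            "max_new_tokens": 100,
            "temperature": 0.1,
            "prompt_template": "{prompt}",       # chat template is already applied locally, so pass the prompt through
        },
    )
    print("".join(output).strip())               # run() yields text chunks for language models

Because the template is rendered locally, prompt_template is set to the identity template so the hosted model does not wrap the prompt a second time.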
config.py CHANGED
@@ -4,6 +4,7 @@ from oauth2client.service_account import ServiceAccountCredentials
 
 # Read the authentication token from the environment variable
 hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")
+replicate_token = os.getenv("REPLICATE_TOKEN")
 
 # Google Sheets configuration
 def init_google_sheets_client():
@@ -15,17 +16,26 @@ def init_google_sheets_client():
 google_sheets_name = "Chatbot Test"
 
 # Define available models
-models = {
+huggingface_tokenizer = {
     "Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
     "Llama-2-7B-Chat": "meta-llama/Llama-2-7b-chat-hf",
-    "Yi-6B-Chat": "01-ai/Yi-6B-Chat",
-    "Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
+    #"Yi-6B-Chat": "01-ai/Yi-6B-Chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
-# List of models fine-tuned in 4-bit or 8-bit
-quantized_models = {
-    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+
+# Available models on Replicate
+replicate_model = {
+    "Meta-Llama-3-8B-Instruct": "meta/Meta-Llama-3-8B-Instruct",
+    "Llama-2-7B-Chat": "meta/Llama-2-7b-chat-hf",
+    #"Yi-6B-Chat": "01-ai/yi-34b-chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
 
+# List of models fine-tuned in 4-bit or 8-bit
+#quantized_models = {
+#    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+#}
+
 # Default model name
 default_model_name = "Meta-Llama-3-8B-Instruct"
 
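config.py now reads a second secret, REPLICATE_TOKEN, and keeps two tables keyed by the same display names: huggingface_tokenizer for the tokenizer repo and replicate_model for the hosted model. Note that app.py still imports models and quantized_models from config, while this file renames models to huggingface_tokenizer and comments quantized_models out, so those names need to remain importable (for example via aliases) for the import line to resolve. A small startup check along these lines (illustrative, not part of the commit) can catch a missing secret or a model name present in only one table:

    # Illustrative startup check, assuming config.py as committed above.
    from config import huggingface_tokenizer, replicate_model, replicate_token, default_model_name

    if not replicate_token:
        raise RuntimeError("REPLICATE_TOKEN is not set; add it as a Space secret.")

    # Every model offered in the UI needs both a tokenizer repo and a Replicate slug.
    unmatched = set(huggingface_tokenizer) ^ set(replicate_model)
    if unmatched:
        raise RuntimeError(f"Model names present in only one table: {unmatched}")

    assert default_model_name in replicate_model, "default model needs a Replicate slug"
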
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pandas
 gspread
 oauth2client
 accelerate
-bitsandbytes
+bitsandbytes
+replicate
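
bitsandbytes stays in the requirements (the quantized-model code is only commented out, not removed) and replicate is added for the API client. A quick sanity check after installing, as a sketch:

    # Run after `pip install -r requirements.txt` to confirm the new dependency resolves.
    import importlib.metadata
    import replicate  # API client used by app.py for generation
    print("replicate", importlib.metadata.version("replicate"))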