ignacio committed on
Commit 6830e68
1 Parent(s): d6077fd

v1 solo api

Files changed (3)
  1. app.py +48 -27
  2. config.py +16 -6
  3. requirements.txt +2 -1
app.py CHANGED
@@ -4,8 +4,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import pandas as pd
 from datetime import datetime, timedelta, timezone
 import torch
-from config import hugging_face_token, init_google_sheets_client, models, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
+from config import hugging_face_token, replicate_token, init_google_sheets_client, models, replicate_model, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
 import spaces
+import replicate
 
 # Hack for ZeroGPU
 torch.jit.script = lambda f: f
@@ -39,14 +40,18 @@ tokenizer, model = None, None
 # Initialize the data list
 data = []
 
+# Initialize the Replicate client
+replicate_api = replicate.Client(api_token=replicate_token)
+
 # Load the model and tokenizer once at the beginning
 def load_model(model_name):
-    global tokenizer, model, selected_model
+    global tokenizer, selected_model  # model is no longer loaded locally
     try:
         # Release the memory of the previous model if exists
-        if model is not None:
-            del model
-            torch.cuda.empty_cache()
+        # Not applicable for the API version
+        #if model is not None:
+        #    del model
+        #    torch.cuda.empty_cache()
 
         # Check if the model is in models or quantized_models and load accordingly
         if model_name in models:
@@ -68,44 +73,45 @@ def load_model(model_name):
             tokenizer.pad_token = tokenizer.eos_token
             tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
 
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            token=hugging_face_token,
-            trust_remote_code=True
-        )
-
-        # Only move to CUDA if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to("cuda")
-
+        #model = AutoModelForCausalLM.from_pretrained(
+        #    model_path,
+        #    token=hugging_face_token,
+        #    trust_remote_code=True
+        #)
+        #
+        ## Only move to CUDA if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to("cuda")
+        #
         selected_model = model_name
     except Exception as e:
         print(f"Error loading model {model_name}: {e}")
         raise e
-    return tokenizer, model
+    return tokenizer  #, model
 
 
 
 # Ensure the initial model is loaded
-tokenizer, model = load_model(selected_model)
+# Now only the tokenizer is loaded
+tokenizer = load_model(selected_model)  #, model
 
 # Chat history
 chat_history = []
 
 # Function to handle interaction with model
-@spaces.GPU
+#@spaces.GPU
 def interact(user_input, history, interaction_count, model_name):
     global tokenizer, model
     try:
         if tokenizer is None or model is None:
             raise ValueError("Tokenizer or model is not initialized.")
 
-        # Determine the device to use (either CUDA if available, or CPU)
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        # Only move the model to the device if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to(device)
+        ## Determine the device to use (either CUDA if available, or CPU)
+        #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        #
+        ## Only move the model to the device if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to(device)
 
         if interaction_count >= MAX_INTERACTIONS:
             user_input += ". Thank you for your questions. Our session is now over. Goodbye!"
@@ -120,9 +126,24 @@ def interact(user_input, history, interaction_count, model_name):
         prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
         # Move input tensor to the correct device
-        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
-        chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
-        response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
+        #input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
+
+        # Build the generation request
+        inpt = {"prompt": prompt,
+                "max_new_tokens": 100,
+                "temperature": 0.1,
+                "prompt_template": "{prompt}"}
+                #"num_return_sequences": 1,
+                #"pad_token_id": tokenizer.eos_token_id}
+
+        # Make the request to the Replicate API
+        response = replicate_api.run(
+            replicate_model[model_name],
+            input=inpt
+        )
+        response = "".join(response).strip()
+        #chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
+        #response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
 
         # Update chat history with generated response
         history.append({"role": "user", "content": user_input})
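
What the app.py change amounts to: interact() no longer calls model.generate() on a locally loaded model; it still renders the chat template with the local tokenizer, then sends the prompt to Replicate and joins the streamed chunks. A minimal sketch of that call path outside the Gradio app (the Replicate slug and the environment-variable names are assumptions for illustration; the app itself resolves the slug through replicate_model in config.py):

    # Sketch only: mirrors the new interact() path under the assumptions stated above.
    import os
    import replicate
    from transformers import AutoTokenizer

    replicate_api = replicate.Client(api_token=os.getenv("REPLICATE_TOKEN"))
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",   # tokenizer repo from config.py (gated model, needs an HF token)
        token=os.getenv("HUGGING_FACE_TOKEN"),
    )

    messages = [{"role": "user", "content": "Tell me a short story about a lighthouse."}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    output = replicate_api.run(
        "meta/meta-llama-3-8b-instruct",         # assumed Replicate slug; the app looks this up in replicate_model
        input={
            "prompt": prompt,
            "max_new_tokens": 100,
            "temperature": 0.1,
            "prompt_template": "{prompt}",       # chat template is already applied locally, so pass the prompt through
        },
    )
    print("".join(output).strip())               # run() yields text chunks for language models

Because the template is rendered locally, prompt_template is set to the identity template so the hosted model does not wrap the prompt a second time.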
config.py CHANGED
@@ -4,6 +4,7 @@ from oauth2client.service_account import ServiceAccountCredentials
 
 # Read the authentication token from the environment variable
 hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")
+replicate_token = os.getenv("REPLICATE_TOKEN")
 
 # Google Sheets configuration
 def init_google_sheets_client():
@@ -15,17 +16,26 @@ def init_google_sheets_client():
 google_sheets_name = "Chatbot Test"
 
 # Define available models
-models = {
+huggingface_tokenizer = {
     "Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
     "Llama-2-7B-Chat": "meta-llama/Llama-2-7b-chat-hf",
-    "Yi-6B-Chat": "01-ai/Yi-6B-Chat",
-    "Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
+    #"Yi-6B-Chat": "01-ai/Yi-6B-Chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
-# List of models fine-tuned in 4-bit or 8-bit
-quantized_models = {
-    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+
+# Available models on Replicate
+replicate_model = {
+    "Meta-Llama-3-8B-Instruct": "meta/Meta-Llama-3-8B-Instruct",
+    "Llama-2-7B-Chat": "meta/Llama-2-7b-chat-hf",
+    #"Yi-6B-Chat": "01-ai/yi-34b-chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
 
+# List of models fine-tuned in 4-bit or 8-bit
+#quantized_models = {
+#    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+#}
+
 # Default model name
 default_model_name = "Meta-Llama-3-8B-Instruct"
 
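config.py now reads a second secret, REPLICATE_TOKEN, and keeps two tables keyed by the same display names: huggingface_tokenizer for the tokenizer repo and replicate_model for the hosted model. Note that app.py still imports models and quantized_models from config, while this file renames models to huggingface_tokenizer and comments quantized_models out, so those names need to remain importable (for example via aliases) for the import line to resolve. A small startup check along these lines (illustrative, not part of the commit) can catch a missing secret or a model name present in only one table:

    # Illustrative startup check, assuming config.py as committed above.
    from config import huggingface_tokenizer, replicate_model, replicate_token, default_model_name

    if not replicate_token:
        raise RuntimeError("REPLICATE_TOKEN is not set; add it as a Space secret.")

    # Every model offered in the UI needs both a tokenizer repo and a Replicate slug.
    unmatched = set(huggingface_tokenizer) ^ set(replicate_model)
    if unmatched:
        raise RuntimeError(f"Model names present in only one table: {unmatched}")

    assert default_model_name in replicate_model, "default model needs a Replicate slug"
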
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pandas
 gspread
 oauth2client
 accelerate
-bitsandbytes
+bitsandbytes
+replicate
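
bitsandbytes stays in the requirements (the quantized-model code is only commented out, not removed) and replicate is added for the API client. A quick sanity check after installing, as a sketch:

    # Run after `pip install -r requirements.txt` to confirm the new dependency resolves.
    import importlib.metadata
    import replicate  # API client used by app.py for generation
    print("replicate", importlib.metadata.version("replicate"))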