ignacio committed · Commit 6830e68 · Parent(s): d6077fd

v1 solo api

Files changed:
- app.py +48 -27
- config.py +16 -6
- requirements.txt +2 -1
app.py CHANGED

@@ -4,8 +4,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import pandas as pd
 from datetime import datetime, timedelta, timezone
 import torch
-from config import hugging_face_token, init_google_sheets_client, models, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
+from config import hugging_face_token, replicate_token, init_google_sheets_client, models, replicate_model, quantized_models, default_model_name, user_names, google_sheets_name, MAX_INTERACTIONS
 import spaces
+import replicate
 
 # Hack for ZeroGPU
 torch.jit.script = lambda f: f
@@ -39,14 +40,18 @@ tokenizer, model = None, None
 # Initialize the data list
 data = []
 
+# Initialize the Replicate client
+replicate_api = replicate.Client(api_token=replicate_token)
+
 # Load the model and tokenizer once at the beginning
 def load_model(model_name):
-    global tokenizer, model
+    global tokenizer, selected_model  #model
     try:
         # Release the memory of the previous model if exists
-        if model is not None:
-            del model
-            torch.cuda.empty_cache()
+        # Not applicable for the API:
+        #if model is not None:
+        #    del model
+        #    torch.cuda.empty_cache()
 
         # Check if the model is in models or quantized_models and load accordingly
         if model_name in models:
@@ -68,44 +73,45 @@ def load_model(model_name):
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
 
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            token=hugging_face_token,
-            trust_remote_code=True
-        )
-
-        # Only move to CUDA if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to("cuda")
-
+        #model = AutoModelForCausalLM.from_pretrained(
+        #    model_path,
+        #    token=hugging_face_token,
+        #    trust_remote_code=True
+        #)
+        #
+        ## Only move to CUDA if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to("cuda")
+        #
         selected_model = model_name
     except Exception as e:
         print(f"Error loading model {model_name}: {e}")
         raise e
-    return tokenizer, model
+    return tokenizer  #, model
 
 
 
 # Ensure the initial model is loaded
-tokenizer, model = load_model(selected_model)
+# Now only the tokenizer is loaded
+tokenizer = load_model(selected_model)  #, model
 
 # Chat history
 chat_history = []
 
 # Function to handle interaction with model
-@spaces.GPU
+#@spaces.GPU
 def interact(user_input, history, interaction_count, model_name):
     global tokenizer, model
     try:
         if tokenizer is None or model is None:
             raise ValueError("Tokenizer or model is not initialized.")
 
-        # Determine the device to use (either CUDA if available, or CPU)
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        # Only move the model to the device if it's not a quantized model
-        if model_name not in quantized_models:
-            model = model.to(device)
+        ## Determine the device to use (either CUDA if available, or CPU)
+        #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        #
+        ## Only move the model to the device if it's not a quantized model
+        #if model_name not in quantized_models:
+        #    model = model.to(device)
 
         if interaction_count >= MAX_INTERACTIONS:
             user_input += ". Thank you for your questions. Our session is now over. Goodbye!"
@@ -120,9 +126,24 @@ def interact(user_input, history, interaction_count, model_name):
         prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
         # Move input tensor to the correct device
-        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
-        chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
-        response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
+        #input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
+
+        # Build the generation request
+        inpt = {"prompt": prompt,
+                "max_new_tokens": 100,
+                "temperature": 0.1,
+                "prompt_template": "{prompt}",}
+                #"num_return_sequences": 1,
+                #"pad_token_id": tokenizer.eos_token_id}
+
+        # Make the request
+        response = replicate_api.run(
+            replicate_model[model_name],
+            input=inpt
+        )
+        response = "".join(response).strip()
+        #chat_history_ids = model.generate(input_ids, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id, temperature=0.1)
+        #response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
 
         # Update chat history with generated response
         history.append({"role": "user", "content": user_input})
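Two things about the new inference path are worth spelling out. The HF tokenizer is still loaded locally, but only to build the prompt via `apply_chat_template`; generation itself now happens remotely on Replicate. And `replicate_api.run(...)` on a language model returns an iterator of output chunks rather than a single string, which is why the response is assembled with `"".join(response)`. A minimal standalone sketch of the same call pattern (the model slug and prompt are illustrative, not taken from the commit, and whether a given model version accepts `max_new_tokens` or `max_tokens` depends on its published input schema):

```python
import os
import replicate

# Authenticate with the same environment variable the Space uses
# (assumes REPLICATE_TOKEN is set).
client = replicate.Client(api_token=os.getenv("REPLICATE_TOKEN"))

# run() on a text-generation model yields output chunks as they stream.
output = client.run(
    "meta/meta-llama-3-8b-instruct",  # illustrative slug, not from the commit
    input={
        "prompt": "Hello, how are you?",
        "max_new_tokens": 100,   # name mirrors the commit; some versions expect max_tokens
        "temperature": 0.1,
        "prompt_template": "{prompt}",  # pass the prompt through unchanged
    },
)

# Join the streamed chunks into the final response string.
response = "".join(output).strip()
print(response)
```

Setting `prompt_template` to `"{prompt}"` matters here: the app already applies the chat template locally, so the remote default template must not wrap the prompt a second time.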
config.py CHANGED

@@ -4,6 +4,7 @@ from oauth2client.service_account import ServiceAccountCredentials
 
 # Read the authentication token from the environment variable
 hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")
+replicate_token = os.getenv("REPLICATE_TOKEN")
 
 # Google Sheets configuration
 def init_google_sheets_client():
@@ -15,17 +16,26 @@ def init_google_sheets_client():
 google_sheets_name = "Chatbot Test"
 
 # Define available models
-models = {
+huggingface_tokenizer = {
     "Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
     "Llama-2-7B-Chat": "meta-llama/Llama-2-7b-chat-hf",
-    "Yi-6B-Chat": "01-ai/Yi-6B-Chat",
-    "Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
+    #"Yi-6B-Chat": "01-ai/Yi-6B-Chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
-# List of models fine-tuned in 4-bit or 8-bit
-quantized_models = {
-    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+
+# Available models for Replicate
+replicate_model = {
+    "Meta-Llama-3-8B-Instruct": "meta/Meta-Llama-3-8B-Instruct",
+    "Llama-2-7B-Chat": "meta/Llama-2-7b-chat-hf",
+    #"Yi-6B-Chat": "01-ai/yi-34b-chat",
+    #"Qwen2-7B-Instruct": "Qwen/Qwen2-7B-Instruct"
 }
 
+# List of models fine-tuned in 4-bit or 8-bit
+#quantized_models = {
+#    "Llama-3-8B-Finetuning-Stories": "rodrisouza/Llama-3-8B-Finetuning-Stories",
+#}
+
 # Default model name
 default_model_name = "Meta-Llama-3-8B-Instruct"
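Note that the new config reads the token with `os.getenv`, which silently returns `None` when the `REPLICATE_TOKEN` secret is missing, so a misconfigured Space only fails on the first API call. A fail-fast guard is one option (a sketch, not part of the commit):

```python
import os

# Sketch: surface a missing Space secret at startup rather than at the
# first replicate_api.run() call.
replicate_token = os.getenv("REPLICATE_TOKEN")
if not replicate_token:
    raise RuntimeError(
        "REPLICATE_TOKEN is not set; add it as a secret in the Space settings."
    )
```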
requirements.txt CHANGED

@@ -6,4 +6,5 @@ pandas
 gspread
 oauth2client
 accelerate
-bitsandbytes
+bitsandbytes
+replicate
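The new `replicate` entry pulls in the Replicate Python client from PyPI; `bitsandbytes` stays in place even though the quantized-model path is commented out, presumably so local inference can be restored later. A quick post-install sanity check (a sketch; no versions are pinned in the file):

```python
# Confirm the new dependency resolved after `pip install -r requirements.txt`.
from importlib.metadata import version

import replicate  # the Replicate Python client added by this commit

print("replicate", version("replicate"))
```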