Update app.py
app.py CHANGED
@@ -13,8 +13,7 @@ from botocore.exceptions import NoCredentialsError
 from functools import cached_property
 import base64
 from optimum.onnxruntime import ORTModelForCausalLM
-
-
+import bitsandbytes as bnb
 
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
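Note: bitsandbytes is imported as bnb but never referenced directly; transformers resolves it internally when load_in_8bit=True is passed. Its int8 kernels also need a CUDA build, so a CPU-only install imports cleanly and still fails at model load. A minimal fail-fast check (hypothetical, not part of this commit):

    # Sketch: verify 8-bit support before accepting quantize=True requests.
    import importlib.util
    import torch

    def can_quantize_8bit() -> bool:
        # bitsandbytes int8 kernels require a CUDA device; a CPU-only
        # environment imports the package fine but errors at model load.
        return (importlib.util.find_spec("bitsandbytes") is not None
                and torch.cuda.is_available())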
@@ -41,8 +40,8 @@ class GenerateRequest(BaseModel):
     num_return_sequences: int = 1
     do_sample: bool = False
     stop_sequences: list[str] = []
+    quantize: bool = True
     use_onnx: bool = False
-    use_bettertransformer: bool = True
     @field_validator("model_name")
     def model_name_cannot_be_empty(cls, v):
         if not v:
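Note: quantize defaults to True, so 8-bit loading is opt-out rather than opt-in; existing clients that never send the field will silently start receiving quantized models once this change deploys.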
@@ -61,16 +60,19 @@ class S3ModelLoader:
         self.model_cache = {}
     def _get_s3_uri(self, model_name):
         return f"s3://{self.bucket_name}/{model_name.replace('/', '-')}"
-    async def _load_model_and_tokenizer(self, model_name, use_onnx, use_bettertransformer):
+    async def _load_model_and_tokenizer(self, model_name, quantize, use_onnx):
         s3_uri = self._get_s3_uri(model_name)
         try:
             config = AutoConfig.from_pretrained(s3_uri, local_files_only=False)
             if use_onnx:
                 model = ORTModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=False).to(self.device)
+            elif quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    s3_uri, config=config, local_files_only=False,
+                    load_in_8bit=True
+                ).to(self.device)
             else:
                 model = AutoModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=False).to(self.device)
-                if use_bettertransformer:
-                    model = BetterTransformer.transform(model)
             tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config, local_files_only=False)
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
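Two caveats on this branch worth flagging: the loader's device property returns torch.device("cpu") (see the hunk further down), while bitsandbytes int8 weights require CUDA, and recent transformers versions raise a ValueError when .to() is called on an 8-bit model. A sketch of a safer variant using the BitsAndBytesConfig API (an assumed fix, not what this commit does):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model = AutoModelForCausalLM.from_pretrained(
        s3_uri,                        # same source as in the diff
        config=config,
        local_files_only=False,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",             # accelerate places the int8 weights
    )
    # no .to(self.device): moving an int8 model is not supported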
@@ -81,10 +83,13 @@ class S3ModelLoader:
             tokenizer = AutoTokenizer.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN)
             if use_onnx:
                 model = ORTModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN).to(self.device)
+            elif quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN,
+                    load_in_8bit=True
+                ).to(self.device)
             else:
                 model = AutoModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN).to(self.device)
-                if use_bettertransformer:
-                    model = BetterTransformer.transform(model)
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
             return model, tokenizer
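The hub fallback now repeats the same three-way dispatch as the S3 branch. A shared helper would keep the two paths from drifting; a sketch (an assumed refactor, not in this commit):

    def _load_causal_lm(self, source, quantize, use_onnx, config, **kwargs):
        # kwargs carries either local_files_only=False (S3 path) or
        # token=HUGGINGFACE_HUB_TOKEN (hub fallback).
        if use_onnx:
            return ORTModelForCausalLM.from_pretrained(
                source, config=config, **kwargs).to(self.device)
        if quantize:
            # int8 weights: skip .to(), see the note above
            return AutoModelForCausalLM.from_pretrained(
                source, config=config, load_in_8bit=True, **kwargs)
        return AutoModelForCausalLM.from_pretrained(
            source, config=config, **kwargs).to(self.device)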
@@ -93,10 +98,10 @@ class S3ModelLoader:
     @cached_property
     def device(self):
         return torch.device("cpu")
-    async def get_model_and_tokenizer(self, model_name, use_onnx, use_bettertransformer):
-        key = f"{model_name}-{use_onnx}-{use_bettertransformer}"
+    async def get_model_and_tokenizer(self, model_name, quantize, use_onnx):
+        key = f"{model_name}-{quantize}-{use_onnx}"
         if key not in self.model_cache:
-            model, tokenizer = await self._load_model_and_tokenizer(model_name, use_onnx, use_bettertransformer)
+            model, tokenizer = await self._load_model_and_tokenizer(model_name, quantize, use_onnx)
            self.model_cache[key] = {"model":model, "tokenizer":tokenizer}
         return self.model_cache[key]["model"], self.model_cache[key]["tokenizer"]
     async def get_pipeline(self, model_name, task_type):
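Because both flags are baked into the cache key, an 8-bit and a full-precision copy of the same model can coexist in memory:

    # Illustration of the new keying (model name is illustrative):
    def cache_key(model_name: str, quantize: bool, use_onnx: bool) -> str:
        return f"{model_name}-{quantize}-{use_onnx}"

    assert cache_key("gpt2", True, False) != cache_key("gpt2", False, False)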
@@ -122,9 +127,9 @@ async def generate(request: GenerateRequest):
     num_return_sequences = request.num_return_sequences
     do_sample = request.do_sample
     stop_sequences = request.stop_sequences
+    quantize = request.quantize
     use_onnx = request.use_onnx
-    use_bettertransformer = request.use_bettertransformer
-    model, tokenizer = await model_loader.get_model_and_tokenizer(model_name, use_onnx, use_bettertransformer)
+    model, tokenizer = await model_loader.get_model_and_tokenizer(model_name, quantize, use_onnx)
     if "text-to-text" == task_type:
         generation_config = GenerationConfig(temperature=temperature,max_new_tokens=max_new_tokens,top_p=top_p,top_k=top_k,repetition_penalty=repetition_penalty,do_sample=do_sample,num_return_sequences=num_return_sequences,eos_token_id = tokenizer.eos_token_id)
     if stream:
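A request exercising the new flag might look like this; the route path, port, and the task_type field name are assumptions, since the route decorator and the rest of generate() sit outside this diff:

    import requests

    resp = requests.post(
        "http://localhost:7860/generate",   # assumed route for the Space
        json={
            "model_name": "gpt2",           # illustrative model id
            "task_type": "text-to-text",    # inferred from the comparison above
            "quantize": True,
            "use_onnx": False,
        },
    )
    print(resp.json())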