Hjgugugjhuhjggg committed
Commit 7c1d188 · verified · 1 Parent(s): cea60ee

Update app.py

Files changed (1)
  1. app.py +18 -13
app.py CHANGED
@@ -13,8 +13,7 @@ from botocore.exceptions import NoCredentialsError
 from functools import cached_property
 import base64
 from optimum.onnxruntime import ORTModelForCausalLM
-from optimum.bettertransformer import BetterTransformer
-
+import bitsandbytes as bnb
 
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
@@ -41,8 +40,8 @@ class GenerateRequest(BaseModel):
     num_return_sequences: int = 1
     do_sample: bool = False
     stop_sequences: list[str] = []
+    quantize: bool = True
     use_onnx: bool = False
-    use_bettertransformer: bool = True
     @field_validator("model_name")
     def model_name_cannot_be_empty(cls, v):
         if not v:
@@ -61,16 +60,19 @@ class S3ModelLoader:
         self.model_cache = {}
     def _get_s3_uri(self, model_name):
         return f"s3://{self.bucket_name}/{model_name.replace('/', '-')}"
-    async def _load_model_and_tokenizer(self, model_name, use_onnx, use_bettertransformer):
+    async def _load_model_and_tokenizer(self, model_name, quantize, use_onnx):
         s3_uri = self._get_s3_uri(model_name)
         try:
             config = AutoConfig.from_pretrained(s3_uri, local_files_only=False)
             if use_onnx:
                 model = ORTModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=False).to(self.device)
+            elif quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    s3_uri, config=config, local_files_only=False,
+                    load_in_8bit=True
+                ).to(self.device)
             else:
                 model = AutoModelForCausalLM.from_pretrained(s3_uri, config=config, local_files_only=False).to(self.device)
-            if use_bettertransformer:
-                model = BetterTransformer.transform(model)
             tokenizer = AutoTokenizer.from_pretrained(s3_uri, config=config, local_files_only=False)
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
@@ -81,10 +83,13 @@ class S3ModelLoader:
             tokenizer = AutoTokenizer.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN)
             if use_onnx:
                 model = ORTModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN).to(self.device)
+            elif quantize:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name, config=config, token=HUGGINGFACE_HUB_TOKEN,
+                    load_in_8bit=True
+                ).to(self.device)
             else:
                 model = AutoModelForCausalLM.from_pretrained(model_name, config=config, token=HUGGINGFACE_HUB_TOKEN).to(self.device)
-            if use_bettertransformer:
-                model = BetterTransformer.transform(model)
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
             return model, tokenizer
@@ -93,10 +98,10 @@ class S3ModelLoader:
     @cached_property
     def device(self):
         return torch.device("cpu")
-    async def get_model_and_tokenizer(self, model_name, use_onnx, use_bettertransformer):
-        key = f"{model_name}-{use_onnx}-{use_bettertransformer}"
+    async def get_model_and_tokenizer(self, model_name, quantize, use_onnx):
+        key = f"{model_name}-{quantize}-{use_onnx}"
         if key not in self.model_cache:
-            model, tokenizer = await self._load_model_and_tokenizer(model_name, use_onnx, use_bettertransformer)
+            model, tokenizer = await self._load_model_and_tokenizer(model_name, quantize, use_onnx)
             self.model_cache[key] = {"model":model, "tokenizer":tokenizer}
         return self.model_cache[key]["model"], self.model_cache[key]["tokenizer"]
     async def get_pipeline(self, model_name, task_type):
@@ -122,9 +127,9 @@ async def generate(request: GenerateRequest):
     num_return_sequences = request.num_return_sequences
     do_sample = request.do_sample
     stop_sequences = request.stop_sequences
+    quantize = request.quantize
     use_onnx = request.use_onnx
-    use_bettertransformer = request.use_bettertransformer
-    model, tokenizer = await model_loader.get_model_and_tokenizer(model_name, use_onnx, use_bettertransformer)
+    model, tokenizer = await model_loader.get_model_and_tokenizer(model_name, quantize, use_onnx)
     if "text-to-text" == task_type:
         generation_config = GenerationConfig(temperature=temperature,max_new_tokens=max_new_tokens,top_p=top_p,top_k=top_k,repetition_penalty=repetition_penalty,do_sample=do_sample,num_return_sequences=num_return_sequences,eos_token_id=tokenizer.eos_token_id)
         if stream:
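
For context, a minimal sketch of how the new quantize flag might be exercised from a client. The fields model_name, quantize, and use_onnx appear in this diff; the route path, port, and the remaining payload fields fall outside the diff and are assumptions for illustration:

import requests

# Hypothetical payload for the GenerateRequest model above; the endpoint
# URL and the fields other than model_name/quantize/use_onnx are assumed.
payload = {
    "model_name": "gpt2",
    "input_text": "Hello, world",
    "task_type": "text-to-text",
    "quantize": True,   # new field introduced by this commit (defaults to True)
    "use_onnx": False,
}

response = requests.post("http://localhost:8000/generate", json=payload)
print(response.json())

When quantize is set, the loader passes load_in_8bit=True to from_pretrained, which delegates 8-bit weight handling to the bitsandbytes backend; that is why the commit swaps the optimum.bettertransformer import for import bitsandbytes as bnb.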