phi committed on
Commit
6ded56f
1 Parent(s): a572fd2
Files changed (1)
  1. app.py +103 -29
app.py CHANGED
@@ -32,17 +32,72 @@ from huggingface_hub import snapshot_download
 # @@ constants ================
 
 DEBUG = bool(int(os.environ.get("DEBUG", "1")))
-BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "0")))
-
+BLOCK_ZH = bool(int(os.environ.get("BLOCK_ZH", "1")))
 TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
 DTYPE = os.environ.get("DTYPE", "bfloat16")
-# DTYPE = 'float16'
 
-# MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
-MODEL_PATH = os.environ.get("MODEL_PATH", "seal_13b_a")
+# ! (no debug) whether to download HF_MODEL_NAME and save to MODEL_PATH
+DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
+# ! uploaded model path, will be downloaded to MODEL_PATH
+HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")
+MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
+
+
+
+# gradio config
 PORT = int(os.environ.get("PORT", "7860"))
 STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
-MAX_TOKENS = 2048
+MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
+TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
+FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
+
+
+"""
+TODO:
+need to upload the model as hugginface/models/seal_13b_a
+# https://huggingface.co/docs/hub/spaces-overview#managing-secrets
+set
+MODEL_REPO_ID=hugginface/models/seal_13b_a
+
+# if persistent, then export the following
+HF_HOME=/data/.huggingface
+TRANSFORMERS_CACHE=/data/.huggingface
+MODEL_PATH=/data/.huggingface/seal-13b-chat-a
+HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
+# if not persistent
+MODEL_PATH=./seal-13b-chat-a
+HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
+
+
+
+# download will auto detect and get the most updated one
+if DOWNLOAD_SNAPSHOT:
+    print(f'Download from HF_MODEL_NAME={HF_MODEL_NAME} -> {MODEL_PATH}')
+    snapshot_download(HF_MODEL_NAME, local_dir=MODEL_PATH)
+elif not DEBUG:
+    assert os.path.exists(MODEL_PATH), f'{MODEL_PATH} not found and no snapshot download'
+
+"""
+
+
+
+
+# ==============================
+print(f'DEBUG mode: {DEBUG}')
+
+if DTYPE == "bfloat16" and not DEBUG:
+    try:
+        compute_capability = torch.cuda.get_device_capability()
+        if compute_capability[0] < 8:
+            gpu_name = torch.cuda.get_device_name()
+            print(
+                "Bfloat16 is only supported on GPUs with compute capability "
+                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+                f"{compute_capability[0]}.{compute_capability[1]}. --> Move to FLOAT16")
+            DTYPE = "float16"
+    except Exception as e:
+        print(f'Unable to obtain compute_capability: {e}')
+
 
 # @@ constants ================
 if not DEBUG:
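
Note on the flag parsing above: `bool(int(...))` is deliberate, since `bool()` on any non-empty string (including `"0"`) is `True`. A minimal sketch of the difference:

```python
import os

os.environ["DEBUG"] = "0"

# Naive parse: bool("0") is True, because any non-empty string is truthy.
assert bool(os.environ["DEBUG"]) is True

# The pattern used in app.py: "0"/"1" -> 0/1 -> False/True.
DEBUG = bool(int(os.environ.get("DEBUG", "1")))
assert DEBUG is False
```
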
@@ -115,7 +170,6 @@ def hf_model_weights_iterator(
         x for x in glob.glob(os.path.join(hf_folder, "*model*.safetensors"))
         if not x.endswith("training_args.bin")
     ]
-    # print(F'Load bin files: {hf_bin_files} // safetensors: {hf_safetensors_files}')
 
     if use_np_cache:
         # Convert the model weights from torch tensors to numpy arrays for
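
The hunk above only drops a leftover debug print around shard discovery. For reference, a self-contained sketch of the same enumeration (`hf_folder` is a hypothetical local checkpoint directory):

```python
import glob
import os

hf_folder = "./seal-13b-chat-a"  # hypothetical checkpoint directory

# Collect all *model*.safetensors shards; the endswith() guard mirrors the
# original code, although the glob pattern already excludes .bin files.
hf_safetensors_files = [
    x for x in glob.glob(os.path.join(hf_folder, "*model*.safetensors"))
    if not x.endswith("training_args.bin")
]
print(hf_safetensors_files)
```
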
@@ -226,15 +280,8 @@ def llama_load_weights(
     state_dict = self.state_dict()
     need_to_load = len(state_dict)
     loaded = 0
-    # try:
-    #     iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
-    # except Exception as e:
-    #     iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, load_format, revision)
     iterator = hf_model_weights_iterator(model_name_or_path, cache_dir, use_np_cache)
 
-    # for name, loaded_weight in hf_model_weights_iterator(
-    #     model_name_or_path, cache_dir, load_format, revision):
-    #     model_name_or_path, cache_dir, use_np_cache):
     for name, loaded_weight in iterator:
         if "rotary_emb.inv_freq" in name:
             continue
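
`hf_model_weights_iterator` is vLLM-internal. A rough standalone sketch of the same idea (yield `(name, tensor)` pairs shard by shard, skipping rotary `inv_freq` buffers), assuming the `safetensors` package and a hypothetical local folder:

```python
import glob
import os

from safetensors.torch import load_file

def iter_weights(hf_folder: str):
    """Yield (name, tensor) pairs from every safetensors shard in a folder."""
    for shard in sorted(glob.glob(os.path.join(hf_folder, "*model*.safetensors"))):
        for name, tensor in load_file(shard).items():
            yield name, tensor

for name, loaded_weight in iter_weights("./seal-13b-chat-a"):  # hypothetical path
    if "rotary_emb.inv_freq" in name:
        continue  # non-persistent rotary buffer, not a state_dict entry
    # ... copy loaded_weight into the matching state_dict parameter ...
```
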
@@ -253,12 +300,6 @@ def llama_load_weights(
             if num_extra_rows > 0:
                 print(f'Add empty to {num_extra_rows} extra row for {name}')
             print(f'Load: {name} | {padded_vocab_size=} | {self.config.vocab_size=} | {num_extra_rows=} | {param.size()=} | {loaded_weight.size()=} | {load_size=}')
-
-        # if "embed_tokens" in name or "lm_head" in name:
-        #     param = state_dict[name]
-        #     load_padded_tensor_parallel_vocab(param, loaded_weight, tensor_model_parallel_rank)
-        #     loaded += 1
-        #     continue
 
         is_attention_weight = False
         for weight_name, shard_size, offset in attention_weight_specs:
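
The `num_extra_rows` message above refers to padding the vocabulary dimension of `embed_tokens`/`lm_head` up to a padded vocab size. A worked sketch of that padding step with hypothetical shapes:

```python
import torch

vocab_size = 46000         # hypothetical checkpoint vocab size
padded_vocab_size = 46080  # hypothetical padded size (e.g. a multiple of 64)
hidden_size = 4096

loaded_weight = torch.randn(vocab_size, hidden_size)  # stand-in for embed_tokens.weight
num_extra_rows = padded_vocab_size - loaded_weight.size(0)

if num_extra_rows > 0:
    # "Add empty to N extra row": append zero rows so the tensor
    # matches the padded parameter shape before it is copied in.
    extra_rows = torch.zeros(num_extra_rows, hidden_size, dtype=loaded_weight.dtype)
    loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)

assert loaded_weight.size(0) == padded_vocab_size
```
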
@@ -385,8 +426,6 @@ if not DEBUG:
 
 set_documentation_group("component")
 
-DATA_ROOT = os.environ.get("dataroot", "/mnt/workspace/workgroup/phi")
-MODEL_CACHE_DIR = os.path.join(DATA_ROOT, "pret_models")
 
 
 DTYPES = {
@@ -397,7 +436,6 @@ DTYPES = {
 llm = None
 demo = None
 
-RELOAD_SIGNAL = '<<<reload:'
 
 BOS_TOKEN = '<s>'
 EOS_TOKEN = '</s>'
@@ -824,28 +862,64 @@ path_markdown = """
 {model_path}
 """
 
+def check_model_path(model_path) -> str:
+    assert os.path.exists(model_path), f'{model_path} not found'
+    ckpt_info = "None"
+    if os.path.isdir(model_path):
+        if os.path.exists(f'{model_path}/info.txt'):
+            with open(f'{model_path}/info.txt', 'r') as f:
+                ckpt_info = f.read()
+                print(f'Checkpoint info:\n{ckpt_info}\n-----')
+        else:
+            print(f'info.txt not found in {model_path}')
+            print(f'model path dir: {list(os.listdir(model_path))}')
+
+    return ckpt_info
+
 
 def launch():
     global demo, llm, DEBUG
     model_desc = MODEL_DESC
     model_path = MODEL_PATH
     model_title = MODEL_TITLE
+    hf_model_name = HF_MODEL_NAME
     tensor_parallel = TENSOR_PARALLEL
     assert tensor_parallel > 0 , f'{tensor_parallel} invalid'
     dtype = DTYPE
     sys_prompt = SYSTEM_PROMPT_1
     max_tokens = MAX_TOKENS
-    print(f'Launch config: {model_path=} / {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens}\n{SYSTEM_PROMPT_1} | {BLOCK_ZH=}')
+    temperature = TEMPERATURE
+    frequence_penalty = FREQUENCE_PENALTY
+    ckpt_info = "None"
+
+    print(
+        f'Launch config: {model_path=} / {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
+        f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
+        f'\n| frequence_penalty={frequence_penalty} '
+        f'\n| temperature={temperature} '
+        f'\n| hf_model_name={hf_model_name} '
+        f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
+        f'\nsys={SYSTEM_PROMPT_1}'
+        f'\ndesc={model_desc}'
+    )
 
     if DEBUG:
-        model_desc += "\n<br>!!!!! This is in debug mode, responses will be copy original"
+        model_desc += "\n<br>!!!!! This is in debug mode, responses will copy original"
         response_fn = debug_chat_response_echo
+        print(f'Creating in DEBUG MODE')
     else:
         # ! load the model
         import vllm
-        assert os.path.exists(model_path), f'{model_path} not found'
        print(F'VLLM: {vllm.__version__}')
-        print(f'Load path: {model_path}')
+
+        if DOWNLOAD_SNAPSHOT:
+            print(f'Downloading from HF_MODEL_NAME={hf_model_name} -> {model_path}')
+            snapshot_download(hf_model_name, local_dir=model_path)
+
+        assert os.path.exists(model_path), f'{model_path} not found and no snapshot download'
+        ckpt_info = check_model_path(model_path)
+
+        print(f'Load path: {model_path} | {ckpt_info}')
         llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel)
 
     print(f'Use system prompt:\n{sys_prompt}')
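
The launch path now optionally fetches weights from the Hub before constructing the vLLM engine. A minimal standalone sketch of that download step (repo id and local dir taken from the diff's defaults):

```python
import os

from huggingface_hub import snapshot_download

hf_model_name = "DAMO-NLP-SG/seal-13b-chat-a"
model_path = "./seal-13b-chat-a"

# snapshot_download resolves the latest revision of the repo and
# materializes it under local_dir, reusing files already present.
snapshot_download(hf_model_name, local_dir=model_path)
assert os.path.exists(model_path), f'{model_path} not found and no snapshot download'
```
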
@@ -871,9 +945,9 @@ def launch():
         description=f"{model_desc}",
         # ! decide if can change the system prompt.
         additional_inputs=[
-            gr.Number(value=0, label='Temperature (higher -> more random)'),
+            gr.Number(value=temperature, label='Temperature (higher -> more random)'),
             gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
-            gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
+            gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens)'),
             # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
         ],
     )
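
With this hunk the UI defaults come from the env-derived constants instead of hard-coded literals. A minimal sketch of the same `additional_inputs` wiring (the echo handler is hypothetical; `gr.ChatInterface` passes the extra input values after `message` and `history`):

```python
import os

import gradio as gr

TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))

def respond(message, history, temperature, max_tokens, frequency_penalty):
    # Hypothetical stand-in for the real chat response function.
    return f"echo (T={temperature}, max={max_tokens}, fp={frequency_penalty}): {message}"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Number(value=TEMPERATURE, label='Temperature (higher -> more random)'),
        gr.Number(value=MAX_TOKENS, label='Max generated tokens'),
        gr.Number(value=FREQUENCE_PENALTY, label='Frequency penalty (> 0 encourage new tokens)'),
    ],
)

if __name__ == "__main__":
    demo.launch()
```
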
 