phi committed on
Commit
3709b60
1 Parent(s): 6ded56f
Files changed (1)
  1. app.py +44 -54
app.py CHANGED
@@ -57,68 +57,29 @@ TODO:
 need to upload the model as hugginface/models/seal_13b_a
 # https://huggingface.co/docs/hub/spaces-overview#managing-secrets
 set
-MODEL_REPO_ID=hugginface/models/seal_13b_a
+HF_TOKEN=???

+TRANSFORMERS_CACHE=/data/.huggingface
 # if persistent, then export the following
+
 HF_HOME=/data/.huggingface
-TRANSFORMERS_CACHE=/data/.huggingface
 MODEL_PATH=/data/.huggingface/seal-13b-chat-a
 HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a
 # if not persistent
 MODEL_PATH=./seal-13b-chat-a
 HF_MODEL_NAME=DAMO-NLP-SG/seal-13b-chat-a

-
-
-# download will auto detect and get the most updated one
-if DOWNLOAD_SNAPSHOT:
-    print(f'Download from HF_MODEL_NAME={HF_MODEL_NAME} -> {MODEL_PATH}')
-    snapshot_download(HF_MODEL_NAME, local_dir=MODEL_PATH)
-elif not DEBUG:
-    assert os.path.exists(MODEL_PATH), f'{MODEL_PATH} not found and no snapshot download'
-
 """


-
-
 # ==============================
 print(f'DEBUG mode: {DEBUG}')

-if DTYPE == "bfloat16" and not DEBUG:
-    try:
-        compute_capability = torch.cuda.get_device_capability()
-        if compute_capability[0] < 8:
-            gpu_name = torch.cuda.get_device_name()
-            print(
-                "Bfloat16 is only supported on GPUs with compute capability "
-                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
-                f"{compute_capability[0]}.{compute_capability[1]}. --> Move to FLOAT16")
-            DTYPE = "float16"
-    except Exception as e:
-        print(f'Unable to obtain compute_capability: {e}')


-# @@ constants ================
-if not DEBUG:
-
-    # vllm import
-    from vllm import LLM, SamplingParams
-    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
-    from vllm.engine.arg_utils import EngineArgs
-    from vllm.engine.llm_engine import LLMEngine
-    from vllm.outputs import RequestOutput
-    from vllm.sampling_params import SamplingParams
-    from vllm.utils import Counter
-    from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
-                               SequenceGroupMetadata, SequenceOutputs,
-                               SequenceStatus)
-    # ! reconfigure vllm to faster llama
-    from vllm.model_executor.model_loader import _MODEL_REGISTRY
-    from vllm.model_executor.models import LlamaForCausalLM

+# @@ constants ================

-_MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM


 def _detect_lang(text):
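The hunk above describes the Space's runtime configuration: `HF_HOME`, `TRANSFORMERS_CACHE`, `MODEL_PATH`, `HF_MODEL_NAME`, and now an `HF_TOKEN` secret. As a minimal sketch of how that configuration is typically consumed, assuming the same variable names plus illustrative `DEBUG`/`DOWNLOAD_SNAPSHOT` flags and default values that are not part of this commit:

```python
# Sketch only: read the env vars listed in the TODO block and fetch the
# checkpoint with huggingface_hub. Defaults and flag handling here are
# illustrative assumptions, not the app's actual code.
import os
from huggingface_hub import snapshot_download

DEBUG = bool(int(os.environ.get("DEBUG", "0")))
DOWNLOAD_SNAPSHOT = bool(int(os.environ.get("DOWNLOAD_SNAPSHOT", "0")))
MODEL_PATH = os.environ.get("MODEL_PATH", "./seal-13b-chat-a")
HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "DAMO-NLP-SG/seal-13b-chat-a")

if DOWNLOAD_SNAPSHOT:
    # HF_TOKEN would be set as a Space secret for a private or gated repo
    print(f"Download from HF_MODEL_NAME={HF_MODEL_NAME} -> {MODEL_PATH}")
    snapshot_download(HF_MODEL_NAME, local_dir=MODEL_PATH,
                      token=os.environ.get("HF_TOKEN"))
elif not DEBUG:
    assert os.path.exists(MODEL_PATH), f"{MODEL_PATH} not found and no snapshot download"
```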
@@ -390,7 +351,6 @@ def llama_load_weights(
                 intermediate_size + shard_size * tensor_model_parallel_rank,
                 intermediate_size + shard_size * (tensor_model_parallel_rank + 1)
             )
-            # print(f'{name} {param.size()} | {g_offsets} | {u_offsets}')
             _loaded_weight = torch.cat(
                 [
                     loaded_weight[g_offsets[0]:g_offsets[1]],
@@ -420,7 +380,33 @@ def llama_load_weights(

 # Reassign LlamaForCausalLM.load_weights with llama_load_weights
 if not DEBUG:
-    LlamaForCausalLM.load_weights = llama_load_weights
+
+    # vllm import
+    # from vllm import LLM, SamplingParams
+    # ! reconfigure vllm to faster llama
+    try:
+        import vllm
+        from vllm.model_executor.model_loader import _MODEL_REGISTRY
+        from vllm.model_executor.models import LlamaForCausalLM
+
+        _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
+        LlamaForCausalLM.load_weights = llama_load_weights
+
+        if DTYPE == "bfloat16":
+            try:
+                compute_capability = torch.cuda.get_device_capability()
+                if compute_capability[0] < 8:
+                    gpu_name = torch.cuda.get_device_name()
+                    print(
+                        "Bfloat16 is only supported on GPUs with compute capability "
+                        f"of at least 8.0. Your {gpu_name} GPU has compute capability "
+                        f"{compute_capability[0]}.{compute_capability[1]}. --> Move to FLOAT16")
+                    DTYPE = "float16"
+            except Exception as e:
+                print(f'Unable to obtain compute_capability: {e}')
+    except Exception as e:
+        print(f'Failing import and reconfigure VLLM: {str(e)}')
+

 # ! ==================================================================

@@ -501,11 +487,11 @@ class ChatBot(gr.Chatbot):
         return x


-# gr.ChatInterface
 from gradio.components import Button
 from gradio.events import Dependency, EventListenerMethod

-
+# replace events so that submit button is disabled during generation, if stop_btn not found
+# this prevent weird behavior
 def _setup_stop_events(
     self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
 ) -> None:
@@ -571,13 +557,12 @@ def _setup_stop_events
             queue=False,
         )

-
-
 gr.ChatInterface._setup_stop_events = _setup_stop_events

 def chat_response(message, history, temperature: float, max_tokens: int, system_prompt: str = '') -> str:
     global llm
     assert llm is not None
+    from vllm import LLM, SamplingParams
     temperature = float(temperature)
     max_tokens = int(max_tokens)
     if system_prompt.strip() != '':
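Here and in the next several hunks, `from vllm import ...` statements move from module scope into the bodies of the functions that need them. The effect is a lazy import: the app (and DEBUG mode) can start on a machine without vLLM, and the import only happens, and can only fail, when generation actually runs. A small illustrative sketch of the pattern, with a hypothetical helper that is not part of app.py:

```python
# Illustrative sketch of the deferred-import pattern used in these hunks.
# The helper below is hypothetical; only the import placement mirrors the diff.
def generate_once(llm, prompt: str, temperature: float = 0.7, max_tokens: int = 256) -> str:
    # imported inside the function: module import succeeds without vLLM installed,
    # and an ImportError only surfaces when generation is requested
    from vllm import SamplingParams

    params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
    outputs = llm.generate([prompt], params)
    return outputs[0].outputs[0].text
```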
@@ -594,6 +579,7 @@ def chat_response(message, history, temperature: float, max_tokens: int, system_


 def vllm_abort(self: Any):
+    from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
         for seq_group in state_queue:
@@ -607,6 +593,7 @@ def vllm_abort(self: Any):

 # def _vllm_run_engine(self: LLM, use_tqdm: bool = False) -> Dict[str, RequestOutput]:
 def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
+    from vllm.outputs import RequestOutput
     # Initialize tqdm.
     if use_tqdm:
         num_requests = self.llm_engine.get_num_unfinished_requests()
@@ -654,6 +641,7 @@ def vllm_generate_stream(
         A list of `RequestOutput` objects containing the generated
         completions in the same order as the input prompts.
     """
+    from vllm import LLM, SamplingParams
     if prompts is None and prompt_token_ids is None:
         raise ValueError("Either prompts or prompt_token_ids must be "
                          "provided.")
@@ -750,6 +738,7 @@ def chat_response_stream_multiturn(
     frequency_penalty: float,
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ) -> str:
+    from vllm import LLM, SamplingParams
     """Build multi turn
     <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
     <bos>[INST] Prompt [/INST] Answer <eos>
@@ -837,7 +826,7 @@ This is a DAMO SeaL-13B chatbot assistant built by DAMO Academy, Alibaba Group.


 cite_markdown = """
-### Citation
+## Citation
 If you find our project useful, hope you can star our repo and cite our paper as follows:
 ```
 @article{damonlpsg2023seallm,
@@ -849,9 +838,8 @@ If you find our project useful, hope you can star our repo and cite our paper as
 """

 warning_markdown = """
-### Warning:
+## Warning:
 <span style="color: red">The chatbot may produce inaccurate and harmful information about people, places, or facts.</span>
-
 <span style="color: red">We strongly advise against misuse of the chatbot to knowingly generate harmful or unethical content, \
 or content that violates locally applicable and international laws or regulations, including hate speech, violence, pornography, deception, etc!</span>
 """
@@ -893,11 +881,12 @@ def launch():
         ckpt_info = "None"

     print(
-        f'Launch config: {model_path=} / {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
+        f'Launch config: {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens} | {BLOCK_ZH=} '
         f'\n| STREAM_YIELD_MULTIPLE={STREAM_YIELD_MULTIPLE} '
         f'\n| frequence_penalty={frequence_penalty} '
         f'\n| temperature={temperature} '
         f'\n| hf_model_name={hf_model_name} '
+        f'\n| model_path={model_path} '
         f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
         f'\nsys={SYSTEM_PROMPT_1}'
         f'\ndesc={model_desc}'
@@ -910,6 +899,8 @@ def launch():
     else:
         # ! load the model
         import vllm
+        from vllm import LLM, SamplingParams
+
         print(F'VLLM: {vllm.__version__}')

         if DOWNLOAD_SNAPSHOT:
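Past the snapshot download, the diff only shows the version print, not the engine construction itself. For orientation, a hedged sketch of how an engine is typically built from the `model_path`, `dtype`, and `tensor_parallel` values that appear in the launch-config print above, using vLLM's public `LLM` class; the actual call in app.py is outside this diff, so treat the argument wiring as an assumption:

```python
# Hedged sketch: typical vLLM engine construction from the launch-config values.
# The real call in app.py is not shown in this diff.
from vllm import LLM

def build_llm(model_path: str, dtype: str, tensor_parallel: int) -> LLM:
    return LLM(
        model=model_path,                      # e.g. /data/.huggingface/seal-13b-chat-a
        dtype=dtype,                           # "bfloat16", or the "float16" fallback
        tensor_parallel_size=tensor_parallel,  # number of GPUs to shard across
    )
```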
@@ -962,7 +953,6 @@ def launch():

 def main():

-    # launch(parser.parse_args())
     launch()

