SeaLLM-Chat

Running on Zero

App Files Files Community

phi committed on Oct 19, 2023

Commit

a572fd2

•

1 Parent(s): 5100e68

change files

Browse files

Files changed (2) hide show

app.py +254 -133
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -28,10 +28,25 @@ from typing import List, Optional, Union, Dict, Tuple
 from tqdm.auto import tqdm
 from huggingface_hub import snapshot_download
-DEBUG = True
-if not DEBUG:
     # vllm import
     from vllm import LLM, SamplingParams
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -51,6 +66,22 @@ if not DEBUG:
     _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
 def hf_model_weights_iterator(
     model_name_or_path: str,
     cache_dir: Optional[str] = None,
@@ -208,26 +239,26 @@ def llama_load_weights(
         if "rotary_emb.inv_freq" in name:
             continue
-        # if "embed_tokens" in name or "lm_head" in name:
-        #     param = state_dict[name]
-        #     # Consider padding in the vocab size.
-        #     padded_vocab_size = (param.shape[0] * tp_size)
-        #     # num_extra_rows = padded_vocab_size - self.config.vocab_size
-        #     num_extra_rows = padded_vocab_size - loaded_weight.size(0)
-        #     load_size = loaded_weight.size()
-        #     extra_rows = torch.empty(num_extra_rows,
-        #                                 loaded_weight.shape[1])
-        #     extra_rows = extra_rows.to(loaded_weight)
-        #     loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
-        #     if num_extra_rows > 0:
-        #         print(f'Add empty to {num_extra_rows} extra row for {name}')
-        #     print(f'Load: {name} | {padded_vocab_size=} | {self.config.vocab_size=} | {num_extra_rows=} | {param.size()=} | {loaded_weight.size()=} | {load_size=}')
         if "embed_tokens" in name or "lm_head" in name:
             param = state_dict[name]
-            load_padded_tensor_parallel_vocab(param, loaded_weight, tensor_model_parallel_rank)
-            loaded += 1
-            continue
         is_attention_weight = False
         for weight_name, shard_size, offset in attention_weight_specs:
@@ -428,29 +459,84 @@ class ChatBot(gr.Chatbot):
     ):
         x = super()._postprocess_chat_messages(chat_message)
         if isinstance(x, str):
-            x = x.replace("\n", "<br>")
         return x
-def load_ckpt(ckpt_file: str) -> str:
-    global llm
-    status = "Failed"
-    if not os.path.exists(ckpt_file):
-        status = f"Failed - file not found: {ckpt_file}"
-    elif not ckpt_file.endswith(".bin"):
-        status = f"Failed - file not .bin: {ckpt_file}"
     else:
-        try:
-            state_dict = torch.load(ckpt_file, map_location='cpu')
-            print(f'loaded state_dict: {ckpt_file}')
-            llm.llm_engine.workers[0].model.load_state_dict(state_dict)
-            status = f'Success. Loaded {ckpt_file}'
-        except Exception as e:
-            status = f'Failed - {str(e)}'
-    return status
 def chat_response(message, history, temperature: float, max_tokens: int, system_prompt: str = '') -> str:
     global llm
     assert llm is not None
@@ -466,7 +552,6 @@ def chat_response(message, history, temperature: float, max_tokens: int, system_
     sampling_params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
     gen = llm.generate(message, sampling_params)
     out = gen[0].outputs[0].text
-    # print(f'{message}<<<{out}>>>')
     return f'{out}'
@@ -493,10 +578,6 @@ def _vllm_run_engine(self: Any, use_tqdm: bool = False) -> Dict[str, Any]:
     while self.llm_engine.has_unfinished_requests():
         step_outputs = self.llm_engine.step()
         for output in step_outputs:
-            # if output.finished:
-            #     outputs.append(output)
-                # if use_tqdm:
-                #     pbar.update(1)
             outputs[output.request_id] = output
         # outputs = sorted(outputs, key=lambda x: int(x.request_id))
         if len(outputs) > 0:
@@ -565,53 +646,71 @@ def vllm_generate_stream(
     yield from _vllm_run_engine(self, use_tqdm)
-def chat_response_stream(
     message: str,
-    history: List[Tuple[str, str]],
-    temperature: float,
-    max_tokens: int,
-    frequency_penalty: float,
-    system_prompt: str
 ) -> str:
-    global llm, RES_PRINTED
-    assert llm is not None
-    # force removing all
-    vllm_abort(llm)
-    temperature = float(temperature)
-    frequency_penalty = float(frequency_penalty)
-    max_tokens = int(max_tokens)
-    if system_prompt.strip() != '':
-        # chat version, add system prompt
-        message = llama_chat_sys_input_seq_constructor(
-            message.strip(),
-            sys_prompt=system_prompt
-        )
-    sampling_params = SamplingParams(
-        temperature=temperature, max_tokens=max_tokens,
-        frequency_penalty=frequency_penalty,
-    )
-    cur_out = None
-    for gen in vllm_generate_stream(llm, message, sampling_params):
-        if cur_out is not None:
-            yield cur_out
-        assert len(gen) == 1, f'{gen}'
-        item = next(iter(gen.values()))
-        cur_out = item.outputs[0].text
-    if not RES_PRINTED:
-        print(f'{message}<<<{cur_out}>>>')
-        RES_PRINTED = True
-    if cur_out is not None:
-        yield cur_out
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
     temperature: float,
     max_tokens: int,
     frequency_penalty: float,
-    system_prompt: str
 ) -> str:
     """Build multi turn
     <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
@@ -631,27 +730,46 @@ def chat_response_stream_multiturn(
     frequency_penalty = float(frequency_penalty)
     max_tokens = int(max_tokens)
     # history.append([message, None])
     # history will be appended with message later on
     full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
         message, history, sys_prompt=system_prompt
     )
     sampling_params = SamplingParams(
         temperature=temperature, max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
     )
     cur_out = None
-    for gen in vllm_generate_stream(llm, full_prompt, sampling_params):
-        if cur_out is not None:
             yield cur_out
         assert len(gen) == 1, f'{gen}'
         item = next(iter(gen.values()))
         cur_out = item.outputs[0].text
-    if not RES_PRINTED:
-        print(f'{full_prompt}<<<{cur_out}>>>')
-        RES_PRINTED = True
     if cur_out is not None:
         yield cur_out
 def debug_chat_response_echo(
@@ -662,16 +780,26 @@ def debug_chat_response_echo(
     frequency_penalty: float = 0.4,
     system_prompt: str = SYSTEM_PROMPT_1,
 ) -> str:
     yield f"repeat: {message}"
 # ============ CONSTANT ============
-MODEL_NAME = "DAMO-SeaL-13B"
-MODEL_TITLE = "DAMO-SeaL-13B - An Assistant for South East Asian Languages"
 MODEL_DESC = """
-This is a 13B DAMO-SeaL-Chat assistant model built by DAMO Academy, Alibaba Group. It can produce helpful responses in English, Vietnamese, Indonesian and Thai.
-<br>
-#### Citation
 If you find our project useful, hope you can star our repo and cite our paper as follows:
 ```
 @article{damonlpsg2023seallm,
@@ -680,22 +808,21 @@ If you find our project useful, hope you can star our repo and cite our paper as
   year = 2023,
 }
 ```
-""".strip()
-cite_markdown = """
 """
-#   journal = {arXiv preprint arXiv:2306.02858}
-#   url = {https://arxiv.org/abs/2306.02858}
-TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
-DTYPE = 'bfloat16'
-DTYPE = 'float16'
-MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
 def launch():
@@ -707,26 +834,29 @@ def launch():
     assert tensor_parallel > 0 , f'{tensor_parallel} invalid'
     dtype = DTYPE
     sys_prompt = SYSTEM_PROMPT_1
-    max_tokens = 4096
     if DEBUG:
         model_desc += "\n<br>!!!!! This is in debug mode, responses will be copy original"
         response_fn = debug_chat_response_echo
     else:
         # ! load the model
         assert os.path.exists(model_path), f'{model_path} not found'
         llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel)
         print(f'Use system prompt:\n{sys_prompt}')
-        # response_fn = chat_response_stream_multiturn if args.multiturn else chat_response_stream
         response_fn = chat_response_stream_multiturn
         print(F'respond: {response_fn}')
     demo = gr.ChatInterface(
         response_fn,
         chatbot=ChatBot(
-            # value=MODEL_NAME,
             bubble_full_width=False,
             latex_delimiters=[
                 { "left": "$", "right": "$", "display": False},
@@ -735,7 +865,8 @@ def launch():
         ),
         textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
         submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
-        # stop_btn=None,
         title=f"{model_title}",
         description=f"{model_desc}",
         # ! decide if can change the system prompt.
@@ -743,38 +874,16 @@ def launch():
             gr.Number(value=0, label='Temperature (higher -> more random)'),
             gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
             gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
-            gr.Textbox(value=sys_prompt, label='System prompt', lines=8)],
     )
-    # with gr.Blocks() as demo:
-    #     gr.ChatInterface(
-    #         response_fn,
-    #         chatbot=ChatBot(
-    #             bubble_full_width=False,
-    #             latex_delimiters=[
-    #                 { "left": "$", "right": "$", "display": False},
-    #                 { "left": "$$", "right": "$$", "display": True},
-    #             ]
-    #         ),
-    #         textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
-    #         submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
-    #         # stop_btn=None,
-    #         title=f"{model_title}",
-    #         description=f"{model_desc}",
-    #         # ! decide if can change the system prompt.
-    #         additional_inputs=[
-    #             gr.Number(value=0, label='Temperature (higher -> more random)'),
-    #             gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
-    #             gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
-    #             gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
-    #         ],
-    #     )
-    #     gr.Markdown(cite_markdown)
     demo.queue()
-    # demo.launch(server_port=args.port)
-    demo.launch()
 def main():
@@ -793,7 +902,19 @@ export CUDA_VISIBLE_DEVICES=0
 export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/merlion13s108Hi8kPretFlCW8k.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.FSePlCq13M.FSePlCq13M.m4k.b8.lr1e5.linear.wa0k.ms858k.grac1.se1.8g.v4c.zfsdp/step_4000
 export MODEL_PATH=${dataroot}/llama-2-7b-lxxp-faster
 export MODEL_PATH=${dataroot}/llama-2-7b-chat-xp
 python app.py
 """

 from tqdm.auto import tqdm
 from huggingface_hub import snapshot_download
# @@ constants ================
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean switch from the environment variable ``name``.

    Integer values keep their meaning (``0`` is off, any other integer is
    on). The words true/false, yes/no and on/off are accepted as well, and an
    unset or blank variable yields ``default``.

    Raises:
        ValueError: if the value is neither an integer nor a recognised word.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    value = raw.strip().lower()
    if value in ("true", "yes", "on"):
        return True
    if value in ("false", "no", "off"):
        return False
    try:
        return bool(int(value))
    except ValueError:
        raise ValueError(
            f"{name} must be an integer or true/false, got {raw!r}"
        ) from None


# On by default; export DEBUG=0 to turn it off.
DEBUG = _env_flag("DEBUG", True)
# Off by default; export BLOCK_ZH=1 to turn it on.
BLOCK_ZH = _env_flag("BLOCK_ZH", False)
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
DTYPE = os.environ.get("DTYPE", "bfloat16")
# DTYPE = 'float16'
# MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
MODEL_PATH = os.environ.get("MODEL_PATH", "seal_13b_a")
PORT = int(os.environ.get("PORT", "7860"))
STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1"))
MAX_TOKENS = 2048
# @@ constants ================
+if not DEBUG:
     # vllm import
     from vllm import LLM, SamplingParams
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
     _MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
def _detect_lang(text):
    """Return the language code that langdetect assigns to ``text``.

    Fallbacks:
      * ``"en"`` when there is nothing to analyse: ``text`` is ``None``, not a
        string, blank, or langdetect reports "No features in text.".
      * ``"zh"`` when detection fails for any other reason.
    """
    # Answer before touching langdetect so that a missing generation (None)
    # is not reported as Chinese by the catch-all fallback below.
    if not isinstance(text, str) or not text.strip():
        return "en"

    from langdetect import detect as detect_lang
    from langdetect.detector import LangDetectException

    # NOTE(review): langdetect's README describes detection as
    # non-deterministic unless DetectorFactory.seed is set - consider seeding
    # once at start-up if stable results are needed.
    try:
        return detect_lang(text)
    except LangDetectException as e:
        print(f'Error: {e}')
        # No features in text.
        if "No features in text." in str(e):
            return "en"
        return "zh"
    except Exception as e:
        # Unexpected failure: keep the conservative "zh" answer.
        print(f'Error: {e}')
        return "zh"
 def hf_model_weights_iterator(
     model_name_or_path: str,
     cache_dir: Optional[str] = None,
         if "rotary_emb.inv_freq" in name:
             continue
         if "embed_tokens" in name or "lm_head" in name:
             param = state_dict[name]
+            # Consider padding in the vocab size.
+            padded_vocab_size = (param.shape[0] * tp_size)
+            # num_extra_rows = padded_vocab_size - self.config.vocab_size
+            num_extra_rows = padded_vocab_size - loaded_weight.size(0)
+            load_size = loaded_weight.size()
+            extra_rows = torch.empty(num_extra_rows,
+                                        loaded_weight.shape[1])
+            extra_rows = extra_rows.to(loaded_weight)
+            loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+            if num_extra_rows > 0:
+                print(f'Add empty to {num_extra_rows} extra row for {name}')
+            print(f'Load: {name} | {padded_vocab_size=} | {self.config.vocab_size=} | {num_extra_rows=} | {param.size()=} | {loaded_weight.size()=} | {load_size=}')
+        # if "embed_tokens" in name or "lm_head" in name:
+        #     param = state_dict[name]
+        #     load_padded_tensor_parallel_vocab(param, loaded_weight, tensor_model_parallel_rank)
+        #     loaded += 1
+            # continue
         is_attention_weight = False
         for weight_name, shard_size, offset in attention_weight_specs:
     ):
         x = super()._postprocess_chat_messages(chat_message)
         if isinstance(x, str):
+            x = x.strip().replace("\n", "<br>")
         return x
+# gr.ChatInterface
+from gradio.components import Button
+from gradio.events import Dependency, EventListenerMethod
def _setup_stop_events(
    self, event_triggers: list[EventListenerMethod], event_to_cancel: Dependency
) -> None:
    """Wire submit/stop button state changes around a generation event.

    ``event_triggers`` may be a single trigger or a list/tuple of them. Each
    trigger gets a "start" callback, and ``event_to_cancel`` gets the matching
    "done" callback via ``.then``:

      * stop button usable and a submit button exists: hide submit and show
        stop on start, then swap them back when done;
      * stop button usable, no submit button: show stop on start, hide it
        when done;
      * otherwise, if a submit button exists: make it non-interactive on
        start and interactive again when done.

    "Stop button usable" means ``self.stop_btn and self.is_generator``. In
    that case a click on the stop button also cancels ``event_to_cancel``.
    """
    if not isinstance(event_triggers, (list, tuple)):
        event_triggers = [event_triggers]

    def _wire(on_start, on_done, targets):
        # Same registration order as before: every trigger first, then the
        # follow-up on the cancellable event.
        for trigger in event_triggers:
            trigger(on_start, None, targets, api_name=False, queue=False)
        event_to_cancel.then(on_done, None, targets, api_name=False, queue=False)

    stoppable = self.stop_btn and self.is_generator

    if stoppable and self.submit_btn:
        _wire(
            lambda: (
                Button.update(visible=False),
                Button.update(visible=True),
            ),
            lambda: (Button.update(visible=True), Button.update(visible=False)),
            [self.submit_btn, self.stop_btn],
        )
    elif stoppable:
        _wire(
            lambda: Button.update(visible=True),
            lambda: Button.update(visible=False),
            [self.stop_btn],
        )
    elif self.submit_btn:
        _wire(
            lambda: Button.update(interactive=False),
            lambda: Button.update(interactive=True),
            [self.submit_btn],
        )

    if stoppable:
        self.stop_btn.click(
            None,
            None,
            None,
            cancels=event_to_cancel,
            api_name=False,
        )


# Install the function above in place of gr.ChatInterface's own method.
gr.ChatInterface._setup_stop_events = _setup_stop_events
 def chat_response(message, history, temperature: float, max_tokens: int, system_prompt: str = '') -> str:
     global llm
     assert llm is not None
     sampling_params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
     gen = llm.generate(message, sampling_params)
     out = gen[0].outputs[0].text
     return f'{out}'
     while self.llm_engine.has_unfinished_requests():
         step_outputs = self.llm_engine.step()
         for output in step_outputs:
             outputs[output.request_id] = output
         # outputs = sorted(outputs, key=lambda x: int(x.request_id))
         if len(outputs) > 0:
     yield from _vllm_run_engine(self, use_tqdm)
+# def chat_response_stream(
+#     message: str,
+#     history: List[Tuple[str, str]],
+#     temperature: float,
+#     max_tokens: int,
+#     frequency_penalty: float,
+#     system_prompt: str
+# ) -> str:
+#     global llm, RES_PRINTED
+#     assert llm is not None
+#     # force removing all
+#     vllm_abort(llm)
+#     temperature = float(temperature)
+#     frequency_penalty = float(frequency_penalty)
+#     max_tokens = int(max_tokens)
+#     if system_prompt.strip() != '':
+#         # chat version, add system prompt
+#         message = llama_chat_sys_input_seq_constructor(
+#             message.strip(),
+#             sys_prompt=system_prompt
+#         )
+#     sampling_params = SamplingParams(
+#         temperature=temperature, max_tokens=max_tokens,
+#         frequency_penalty=frequency_penalty,
+#     )
+#     cur_out = None
+#     for j, gen in enumerate(vllm_generate_stream(llm, message, sampling_params)):
+#         if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
+#             yield cur_out
+#         assert len(gen) == 1, f'{gen}'
+#         item = next(iter(gen.values()))
+#         cur_out = item.outputs[0].text
+#     if not RES_PRINTED:
+#         print(f'{message}<<<{cur_out}>>>')
+#         RES_PRINTED = True
+#     if cur_out is not None:
+#         yield cur_out
BLOCK_MESSAGE = """Sorry, Chinese is not currently supported. Please clear the chat box for a new conversation.
抱歉，目前不支持中文。 请清除聊天框以进行新对话。"""


def block_zh(
    message: str,
    history: List[Tuple[str, str]]
) -> bool:
    """Decide whether the current turn must be answered with BLOCK_MESSAGE.

    Args:
        message: the new user message.
        history: previous (user, assistant) turns. ``None`` or an empty list
            is treated as "no history", and turns whose reply is missing
            (``None``) are skipped.

    Returns:
        True if an earlier assistant reply already contains BLOCK_MESSAGE, or
        if the language code detected for ``message`` contains ``'zh'``;
        False otherwise.
    """
    # Only the assistant side (index 1) of each turn is inspected.
    for turn in history or []:
        reply = turn[1] if len(turn) > 1 else None
        if isinstance(reply, str) and BLOCK_MESSAGE in reply:
            return True
    # Substring test, so any detected code containing 'zh' counts.
    if 'zh' in _detect_lang(message):
        print(f'Detect zh: {message}')
        return True
    # ! optionally detect every responses message
    return False
+# 抱歉，目前不支持中文。
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
     temperature: float,
     max_tokens: int,
     frequency_penalty: float,
+    system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ) -> str:
     """Build multi turn
     <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
     frequency_penalty = float(frequency_penalty)
     max_tokens = int(max_tokens)
+    message = message.strip()
+    # detect_ = _detect_lang(message)
+    # print(f'Message language: {detect_}')
+    # ! lang detect
+    if BLOCK_ZH:
+        if block_zh(message, history):
+            yield BLOCK_MESSAGE
+            return
     # history.append([message, None])
     # history will be appended with message later on
     full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
         message, history, sys_prompt=system_prompt
     )
+    # print(full_prompt)
     sampling_params = SamplingParams(
         temperature=temperature, max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
     )
     cur_out = None
+    # for gen in vllm_generate_stream(llm, full_prompt, sampling_params):
+    for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
+        if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
             yield cur_out
         assert len(gen) == 1, f'{gen}'
         item = next(iter(gen.values()))
         cur_out = item.outputs[0].text
+    # if not RES_PRINTED:
+    print(f'{full_prompt}<<<{cur_out}>>>\n')
+        # RES_PRINTED = True
     if cur_out is not None:
         yield cur_out
+    # print(f'Output: {_detect_lang(cur_out)}')
+    if BLOCK_ZH:
+        if "zh" in _detect_lang(cur_out):
+            yield BLOCK_MESSAGE
 def debug_chat_response_echo(
     frequency_penalty: float = 0.4,
     system_prompt: str = SYSTEM_PROMPT_1,
 ) -> str:
+    import time
+    time.sleep(0.5)
     yield f"repeat: {message}"
 # ============ CONSTANT ============
+# https://github.com/gradio-app/gradio/issues/884
+MODEL_NAME = "SeaL-13B"
+MODEL_TITLE = "SeaL-13B - An Assistant for South East Asian Languages"
+# ! add icon: "<img  src='file/lion.jpg' alt='image One'>"
 MODEL_DESC = """
+<span style="font-size: larger">
+This is a DAMO SeaL-13B chatbot assistant built by DAMO Academy, Alibaba Group. It can produce helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
+</span>
+""".strip()
+# <br>
+cite_markdown = """
+### Citation
 If you find our project useful, hope you can star our repo and cite our paper as follows:
 ```
 @article{damonlpsg2023seallm,
   year = 2023,
 }
 ```
 """
+warning_markdown = """
+### Warning:
+<span style="color: red">The chatbot may produce inaccurate and harmful information about people, places, or facts.</span>
+<span style="color: red">We strongly advise against misuse of the chatbot to knowingly generate harmful or unethical content, \
+or content that violates locally applicable and international laws or regulations, including hate speech, violence, pornography, deception, etc!</span>
+"""
+path_markdown = """
+#### Model path:
+{model_path}
+"""
 def launch():
     assert tensor_parallel > 0 , f'{tensor_parallel} invalid'
     dtype = DTYPE
     sys_prompt = SYSTEM_PROMPT_1
+    max_tokens = MAX_TOKENS
+    print(f'Launch config: {model_path=} / {model_title=} / {tensor_parallel=} / {dtype=} / {max_tokens}\n{SYSTEM_PROMPT_1} | {BLOCK_ZH=}')
     if DEBUG:
         model_desc += "\n<br>!!!!! This is in debug mode, responses will be copy original"
         response_fn = debug_chat_response_echo
     else:
         # ! load the model
+        import vllm
         assert os.path.exists(model_path), f'{model_path} not found'
+        print(F'VLLM: {vllm.__version__}')
+        print(f'Load path: {model_path}')
         llm = LLM(model=model_path, dtype=dtype, tensor_parallel_size=tensor_parallel)
         print(f'Use system prompt:\n{sys_prompt}')
         response_fn = chat_response_stream_multiturn
         print(F'respond: {response_fn}')
     demo = gr.ChatInterface(
         response_fn,
         chatbot=ChatBot(
+            label=MODEL_NAME,
             bubble_full_width=False,
             latex_delimiters=[
                 { "left": "$", "right": "$", "display": False},
         ),
         textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
         submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
+        # ! consider preventing the stop button
+        stop_btn=None,
         title=f"{model_title}",
         description=f"{model_desc}",
         # ! decide if can change the system prompt.
             gr.Number(value=0, label='Temperature (higher -> more random)'),
             gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
             gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
+            # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
+        ],
     )
+    with demo:
+        gr.Markdown(warning_markdown)
+        gr.Markdown(cite_markdown)
+        gr.Markdown(path_markdown.format(model_path=model_path))
     demo.queue()
+    demo.launch(server_port=PORT)
 def main():
 export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/merlion13s108Hi8kPretFlCW8k.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.FSePlCq13M.FSePlCq13M.m4k.b8.lr1e5.linear.wa0k.ms858k.grac1.se1.8g.v4c.zfsdp/step_4000
 export MODEL_PATH=${dataroot}/llama-2-7b-lxxp-faster
 export MODEL_PATH=${dataroot}/llama-2-7b-chat-xp
+export DEBUG=0
+export CUDA_VISIBLE_DEVICES=0
+export MODEL_PATH=seal_13b_a
+export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/merlion13s108Hi8kPretFlCW12k.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.SeaV2Cq13M.SeaV2Cq13M.m4k.b8.lr1e5.linear.wa0k.ms858k.grac1.se1.8g.v4c.zfsdp/step_6000
+export MODEL_PATH=${dataroot}/hf_train/pretrain_lm/swpn/mer13s108Hi16kPretFlCWNLP12k_SFT2.LMFromHf.a.gc.t5k0.vizhthid.mean_std.TrainTask.NLNL.Multi.Vi.Sft2Censor.Sft2Censor.m4k.b8.lr1e5.linear.wa0k.ms1144k.grac1.se1.6g.v4c.zfsdp/step_2000
+export PORT=8799
+export BLOCK_ZH=1
 python app.py
+DEBUG=1 python app.py
 """

requirements.txt CHANGED Viewed

@@ -22,5 +22,6 @@ tensorboard
 geomloss
 einops
 gdown
 vllm==0.1.4
 transformers

 geomloss
 einops
 gdown
+langdetect
 vllm==0.1.4
 transformers