SeaLLM-Chat

Running on Zero

App Files Files Community

nxphi47 committed on Nov 27, 2023

Commit

a832036

1 Parent(s): f5d291f

Update app.py

Browse files

Files changed (1) hide show

app.py +423 -54

app.py CHANGED Viewed

@@ -93,15 +93,24 @@ ENABLE_AGREE_POPUP = bool(int(os.environ.get("ENABLE_AGREE_POPUP", "0")))
 MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
 TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
 FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
 gpu_memory_utilization = float(os.environ.get("gpu_memory_utilization", "0.9"))
 # whether to enable quantization, currently not in use
 QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
 DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
 DATA_SET_REPO = None
 """
 Internal instructions of how to configure the DEMO
@@ -196,6 +205,32 @@ MODEL_TITLE = """
 </div>
 """
 # <a href=''><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
 MODEL_DESC = """
 <div style='display:flex; gap: 0.25rem; '>
 <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
@@ -207,20 +242,13 @@ This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank"
 Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
 </span>
 <br>
-<span >
-NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
-<span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
-<ul>
-<li >
-You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
-including but not limited to hate speech, violence, pornography and deception.</li>
-<li >
 The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
-<a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
-</li>
-</ul>
 </span>
 """.strip()
@@ -709,6 +737,7 @@ def llama_chat_multiturn_sys_input_seq_constructor(
     sys_prompt=SYSTEM_PROMPT_1,
     bos_token=BOS_TOKEN,
     eos_token=EOS_TOKEN,
 ):
     """
     ```
@@ -718,18 +747,19 @@ def llama_chat_multiturn_sys_input_seq_constructor(
     ```
     """
     text = ''
     for i, (prompt, res) in enumerate(history):
         if i == 0:
-            text += f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {prompt} {E_INST}"
         else:
-            text += f"{bos_token}{B_INST} {prompt} {E_INST}"
         if res is not None:
             text += f" {res} {eos_token} "
     if len(history) == 0 or text.strip() == '':
-        text = f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {message} {E_INST}"
     else:
-        text += f"{bos_token}{B_INST} {message} {E_INST}"
     return text
@@ -944,6 +974,10 @@ gr.ChatInterface._setup_events = _setup_events
 def vllm_abort(self: Any):
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
@@ -1093,6 +1127,7 @@ def chat_response_stream_multiturn(
     temperature: float,
     max_tokens: int,
     frequency_penalty: float,
     current_time: Optional[float] = None,
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ) -> str:
@@ -1144,6 +1179,7 @@ def chat_response_stream_multiturn(
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
         stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
     )
     cur_out = None
@@ -1163,6 +1199,9 @@ def chat_response_stream_multiturn(
         assert len(gen) == 1, f'{gen}'
         item = next(iter(gen.values()))
         cur_out = item.outputs[0].text
     # TODO: use current_time to register conversations, accoriding history and cur_out
     history_str = format_conversation(history + [[message, cur_out]])
@@ -1236,7 +1275,7 @@ def maybe_upload_to_dataset():
             )
         except Exception as e:
             print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')
 def print_log_file():
     global LOG_FILE, LOG_PATH
@@ -1262,6 +1301,7 @@ def debug_chat_response_echo(
     temperature: float = 0.0,
     max_tokens: int = 4096,
     frequency_penalty: float = 0.4,
     current_time: Optional[float] = None,
     system_prompt: str = SYSTEM_PROMPT_1,
 ) -> str:
@@ -1316,6 +1356,256 @@ async () => {
 }
 """
 def launch():
     global demo, llm, DEBUG, LOG_FILE
     model_desc = MODEL_DESC
@@ -1329,6 +1619,7 @@ def launch():
     max_tokens = MAX_TOKENS
     temperature = TEMPERATURE
     frequence_penalty = FREQUENCE_PENALTY
     ckpt_info = "None"
     print(
@@ -1344,6 +1635,7 @@ def launch():
         f'\n| DISPLAY_MODEL_PATH={DISPLAY_MODEL_PATH} '
         f'\n| LANG_BLOCK_HISTORY={LANG_BLOCK_HISTORY} '
         f'\n| frequence_penalty={frequence_penalty} '
         f'\n| temperature={temperature} '
         f'\n| hf_model_name={hf_model_name} '
         f'\n| model_path={model_path} '
@@ -1409,44 +1701,120 @@ def launch():
         if SAVE_LOGS:
             LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')
-    demo = gr.ChatInterface(
-        response_fn,
-        chatbot=ChatBot(
-            label=MODEL_NAME,
-            bubble_full_width=False,
-            latex_delimiters=[
-                { "left": "$", "right": "$", "display": False},
-                { "left": "$$", "right": "$$", "display": True},
             ],
-            show_copy_button=True,
-        ),
-        textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
-        submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
-        # ! consider preventing the stop button
-        stop_btn=None,
-        title=f"{model_title}",
-        description=f"{model_desc}",
-        additional_inputs=[
-            gr.Number(value=temperature, label='Temperature (higher -> more random)'),
-            gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
-            gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens)'),
-            gr.Number(value=0, label='current_time', visible=False),
-            # ! Remove the system prompt textbox to avoid jailbreaking
-            # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
-        ],
-    )
-    demo.title = MODEL_NAME
-    with demo:
-        gr.Markdown(cite_markdown)
-        if DISPLAY_MODEL_PATH:
-            gr.Markdown(path_markdown.format(model_path=model_path))
-        if ENABLE_AGREE_POPUP:
-            demo.load(None, None, None, _js=AGREE_POP_SCRIPTS)
-    demo.queue()
-    demo.launch(server_port=PORT)
 def main():
@@ -1455,4 +1823,5 @@ def main():
 if __name__ == "__main__":
-    main()

 MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048"))
 TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.1"))
 FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.4"))
+PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0"))
 gpu_memory_utilization = float(os.environ.get("gpu_memory_utilization", "0.9"))
 # whether to enable quantization, currently not in use
 QUANTIZATION = str(os.environ.get("QUANTIZATION", ""))
+# Batch inference file upload
+ENABLE_BATCH_INFER = bool(int(os.environ.get("ENABLE_BATCH_INFER", "1")))
+BATCH_INFER_MAX_ITEMS = int(os.environ.get("BATCH_INFER_MAX_ITEMS", "200"))
+BATCH_INFER_MAX_FILE_SIZE = int(os.environ.get("BATCH_INFER_MAX_FILE_SIZE", "500"))
+BATCH_INFER_MAX_PROMPT_TOKENS = int(os.environ.get("BATCH_INFER_MAX_PROMPT_TOKENS", "4000"))
+BATCH_INFER_SAVE_TMP_FILE = os.environ.get("BATCH_INFER_SAVE_TMP_FILE", "./tmp/pred.json")
+#
 DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
 DATA_SET_REPO = None
 """
 Internal instructions of how to configure the DEMO
 </div>
 """
 # <a href=''><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
+# MODEL_DESC = """
+# <div style='display:flex; gap: 0.25rem; '>
+# <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
+# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
+# <a href='https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
+# </div>
+# <span style="font-size: larger">
+# This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a chatbot assistant optimized for Southeast Asian Languages. It produces helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
+# Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
+# </span>
+# <br>
+# <span >
+# NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
+# <span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
+# <ul>
+# <li >
+# You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
+# including but not limited to hate speech, violence, pornography and deception.</li>
+# <li >
+# The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
+# <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
+# </li>
+# </ul>
+# </span>
+# """.strip()
 MODEL_DESC = """
 <div style='display:flex; gap: 0.25rem; '>
 <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
 Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
 </span>
 <br>
+<span>
+<span style="color: red">NOTE:</span> The chatbot may produce inaccurate and harmful information.
+By using our service, you are required to <span style="color: red">agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">Terms Of Use</a>,</span> which includes
+not to use our service to generate any harmful, inappropriate or unethical or illegal content that violates locally applicable and international laws and regulations.
 The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
+<a href="https://creativecommons.org/licenses/by/4.0/">(CC-BY)</a> or similar license. So do not enter any personal information!
 </span>
 """.strip()
     sys_prompt=SYSTEM_PROMPT_1,
     bos_token=BOS_TOKEN,
     eos_token=EOS_TOKEN,
+    include_end_instruct=True,
 ):
     """
     ```
     ```
     """
     text = ''
+    end_instr = f" {E_INST}" if include_end_instruct else ""
     for i, (prompt, res) in enumerate(history):
         if i == 0:
+            text += f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {prompt}{end_instr}"
         else:
+            text += f"{bos_token}{B_INST} {prompt}{end_instr}"
         if res is not None:
             text += f" {res} {eos_token} "
     if len(history) == 0 or text.strip() == '':
+        text = f"{bos_token}{B_INST} {B_SYS} {sys_prompt} {E_SYS} {message}{end_instr}"
     else:
+        text += f"{bos_token}{B_INST} {message}{end_instr}"
     return text
 def vllm_abort(self: Any):
+    sh = self.llm_engine.scheduler
+    for g in (sh.waiting + sh.running + sh.swapped):
+        sh.abort_seq_group(g.request_id)
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
     temperature: float,
     max_tokens: int,
     frequency_penalty: float,
+    presence_penalty: float,
     current_time: Optional[float] = None,
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ) -> str:
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
+        presence_penalty=presence_penalty,
         stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
     )
     cur_out = None
         assert len(gen) == 1, f'{gen}'
         item = next(iter(gen.values()))
         cur_out = item.outputs[0].text
+        if j >= max_tokens - 2:
+            gr.Warning(f'The response hits limit of {max_tokens} tokens. Consider increase the max tokens parameter in the Additional Inputs.')
     # TODO: use current_time to register conversations, accoriding history and cur_out
     history_str = format_conversation(history + [[message, cur_out]])
             )
         except Exception as e:
             print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')
 def print_log_file():
     global LOG_FILE, LOG_PATH
     temperature: float = 0.0,
     max_tokens: int = 4096,
     frequency_penalty: float = 0.4,
+    presence_penalty: float = 0.0,
     current_time: Optional[float] = None,
     system_prompt: str = SYSTEM_PROMPT_1,
 ) -> str:
 }
 """
+def debug_file_function(
+        files: Union[str, List[str]],
+        prompt_mode: str,
+        temperature: float,
+        max_tokens: int,
+        frequency_penalty: float,
+        presence_penalty: float,
+        stop_strings: str = "[STOP],<s>,</s>",
+        current_time: Optional[float] = None,
+):
+    files = files if isinstance(files, list) else [files]
+    print(files)
+    filenames = [f.name for f in files]
+    all_items = []
+    for fname in filenames:
+        print(f'Reading {fname}')
+        with open(fname, 'r', encoding='utf-8') as f:
+            items = json.load(f)
+        assert isinstance(items, list), f'invalid items from {fname} not list'
+        all_items.extend(items)
+    print(all_items)
+    print(f'{prompt_mode} / {temperature} / {max_tokens}, {frequency_penalty}, {presence_penalty}')
+    save_path = "./test.json"
+    with open(save_path, 'w', encoding='utf-8') as f:
+        json.dump(all_items, f, indent=4, ensure_ascii=False)
+    for x in all_items:
+        x['response'] = "Return response"
+    print_items = all_items[:1]
+    # print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
+    return save_path, print_items
+def validate_file_item(filename, index, item: Dict[str, str]):
+    # BATCH_INFER_MAX_PROMPT_TOKENS
+    message = item['prompt'].strip()
+    if len(message) == 0:
+        raise gr.Error(f'Prompt {index} empty')
+    message_safety = safety_check(message, history=None)
+    if message_safety is not None:
+        raise gr.Error(f'Prompt {index} unsafe or supported: {message_safety}')
+    tokenizer = llm.get_tokenizer() if llm is not None else None
+    if tokenizer is None or len(tokenizer.encode(message, add_special_tokens=False)) >= BATCH_INFER_MAX_PROMPT_TOKENS:
+        raise gr.Error(f"Prompt {index} too long, should be less than {BATCH_INFER_MAX_PROMPT_TOKENS} tokens")
+def read_validate_json_files(files: Union[str, List[str]]):
+    files = files if isinstance(files, list) else [files]
+    filenames = [f.name for f in files]
+    all_items = []
+    for fname in filenames:
+        # check each files
+        print(f'Reading {fname}')
+        with open(fname, 'r', encoding='utf-8') as f:
+            items = json.load(f)
+        assert isinstance(items, list), f'Data {fname} not list'
+        assert all(isinstance(x, dict) for x in items), f'item in input file not list'
+        assert all("prompt" in x for x in items), f'key prompt should be in dict item of input file'
+        for i, x in enumerate(items):
+            validate_file_item(fname, i, x)
+        all_items.extend(items)
+    if len(all_items) > BATCH_INFER_MAX_ITEMS:
+        raise gr.Error(f"Num samples {len(all_items)} > {BATCH_INFER_MAX_ITEMS} allowed.")
+    return all_items
+def remove_gradio_cache():
+    import shutil
+    for root, dirs, files in os.walk('/tmp/gradio/'):
+        for f in files:
+            os.unlink(os.path.join(root, f))
+        for d in dirs:
+            shutil.rmtree(os.path.join(root, d))
+def maybe_upload_batch_set(pred_json_path):
+    global LOG_FILE, DATA_SET_REPO_PATH, SAVE_LOGS
+    if SAVE_LOGS and DATA_SET_REPO_PATH is not "":
+        try:
+            from huggingface_hub import upload_file
+            path_in_repo = "misc/" + os.path.basename(pred_json_path).replace(".json", f'.{time.time()}.json')
+            print(f'upload {pred_json_path} to {DATA_SET_REPO_PATH}//{path_in_repo}')
+            upload_file(
+                path_or_fileobj=pred_json_path,
+                path_in_repo=path_in_repo,
+                repo_id=DATA_SET_REPO_PATH,
+                token=HF_TOKEN,
+                repo_type="dataset",
+                create_pr=True
+            )
+        except Exception as e:
+            print(f'Failed to save to repo: {DATA_SET_REPO_PATH}|{str(e)}')
+def batch_inference(
+        files: Union[str, List[str]],
+        prompt_mode: str,
+        temperature: float,
+        max_tokens: int,
+        frequency_penalty: float,
+        presence_penalty: float,
+        stop_strings: str = "[STOP],<s>,</s>",
+        current_time: Optional[float] = None,
+        system_prompt: Optional[str] = SYSTEM_PROMPT_1
+):
+    """
+    Must handle
+    """
+    global LOG_FILE, LOG_PATH, DEBUG, llm, RES_PRINTED
+    if DEBUG:
+        return debug_file_function(
+            files, prompt_mode, temperature, max_tokens,
+            presence_penalty, stop_strings, current_time)
+    from vllm import LLM, SamplingParams
+    assert llm is not None
+    # assert system_prompt.strip() != '', f'system prompt is empty'
+    stop_strings = [x.strip() for x in stop_strings.strip().split(",")]
+    tokenizer = llm.get_tokenizer()
+    # force removing all
+    # NOTE: need to make sure all cached items are removed!!!!!!!!!
+    vllm_abort(llm)
+    temperature = float(temperature)
+    frequency_penalty = float(frequency_penalty)
+    max_tokens = int(max_tokens)
+    all_items = read_validate_json_files(files)
+    # remove all items in /tmp/gradio/
+    remove_gradio_cache()
+    if prompt_mode == 'chat':
+        prompt_format_fn = llama_chat_multiturn_sys_input_seq_constructor
+    elif prompt_mode == 'few-shot':
+        from functools import partial
+        prompt_format_fn = partial(
+            llama_chat_multiturn_sys_input_seq_constructor, include_end_instruct=False
+        )
+    else:
+        raise gr.Error(f'Wrong mode {prompt_mode}')
+    full_prompts = [
+        prompt_format_fn(
+            x['prompt'], [], sys_prompt=system_prompt
+        )
+        for i, x in enumerate(all_items)
+    ]
+    print(f'{full_prompts[0]}\n')
+    if any(len(tokenizer.encode(x, add_special_tokens=False)) >= 4090 for x in full_prompts):
+        raise gr.Error(f"Some prompt is too long!")
+    stop_seq = list(set(['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'] + stop_strings))
+    sampling_params = SamplingParams(
+        temperature=temperature,
+        max_tokens=max_tokens,
+        frequency_penalty=frequency_penalty,
+        presence_penalty=presence_penalty,
+        stop=stop_seq
+    )
+    generated = llm.generate(full_prompts, sampling_params, use_tqdm=False)
+    responses = [g.outputs[0].text for g in generated]
+    if len(responses) != len(all_items):
+        raise gr.Error(f'inconsistent lengths {len(responses)} != {len(all_items)}')
+    for res, item in zip(responses, all_items):
+        item['response'] = res
+    # save_path = "/mnt/workspace/workgroup/phi/test.json"
+    save_path = BATCH_INFER_SAVE_TMP_FILE
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'w', encoding='utf-8') as f:
+        json.dump(all_items, f, indent=4, ensure_ascii=False)
+    # You need to upload save_path as a new timestamp file.
+    maybe_upload_batch_set(save_path)
+    print_items = all_items[:2]
+    # print_json = json.dumps(print_items, indent=4, ensure_ascii=False)
+    return save_path, print_items
+# Help text for the batch-inference upload form; the item limit shown to the
+# user is taken from BATCH_INFER_MAX_ITEMS.
+FILE_UPLOAD_DESC = f"""File upload json format, with JSON object as list of dict with < {BATCH_INFER_MAX_ITEMS} items"""
+# Full description: the sentence above followed by a fenced example payload.
+FILE_UPLOAD_DESCRIPTION = FILE_UPLOAD_DESC + """
+```
+[ {\"id\": 0, \"prompt\": \"Hello world\"} ,  {\"id\": 1, \"prompt\": \"Hi there?\"}]
+```
+"""
+# https://huggingface.co/spaces/yuntian-deng/ChatGPT4Turbo/blob/main/app.py
+@document()
+class CusTabbedInterface(gr.Blocks):
+    # Tabbed container that, unlike a bare tab list, can also render a
+    # Markdown description under the title before the tabs.
+    def __init__(
+        self,
+        interface_list: list[gr.Interface],
+        tab_names: Optional[list[str]] = None,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        theme: Optional[gr.Theme] = None,
+        analytics_enabled: Optional[bool] = None,
+        css: Optional[str] = None,
+    ):
+        """
+        Parameters:
+            interface_list: a list of interfaces to be rendered in tabs.
+            tab_names: a list of tab names. If None, the tab names will be "Tab 0", "Tab 1", etc. Names are paired with interfaces by position.
+            title: a title for the interface; if provided, it is rendered as a centered heading above the tabs. Also passed to the Blocks constructor, which receives "Gradio" when no title is given.
+            description: optional Markdown rendered below the title and above the tabs; also stored on `self.description`.
+            theme: theme forwarded unchanged to the Blocks constructor.
+            analytics_enabled: whether to allow basic telemetry. If None, will use GRADIO_ANALYTICS_ENABLED environment variable or default to True.
+            css: custom css or path to custom css file to apply to entire Blocks
+        Returns:
+            a Gradio Tabbed Interface for the given interfaces
+        """
+        super().__init__(
+            title=title or "Gradio",
+            theme=theme,
+            analytics_enabled=analytics_enabled,
+            mode="tabbed_interface",
+            css=css,
+        )
+        self.description = description
+        if tab_names is None:
+            tab_names = [f"Tab {i}" for i in range(len(interface_list))]
+        # Components are created in statement order inside this context:
+        # heading, then description, then the tabs.
+        with self:
+            if title:
+                gr.Markdown(
+                    f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
+                )
+            if description:
+                gr.Markdown(description)
+            with gr.Tabs():
+                # zip stops at the shorter of the two lists.
+                for interface, tab_name in zip(interface_list, tab_names):
+                    with gr.Tab(label=tab_name):
+                        interface.render()
 def launch():
     global demo, llm, DEBUG, LOG_FILE
     model_desc = MODEL_DESC
     max_tokens = MAX_TOKENS
     temperature = TEMPERATURE
     frequence_penalty = FREQUENCE_PENALTY
+    presence_penalty = PRESENCE_PENALTY
     ckpt_info = "None"
     print(
         f'\n| DISPLAY_MODEL_PATH={DISPLAY_MODEL_PATH} '
         f'\n| LANG_BLOCK_HISTORY={LANG_BLOCK_HISTORY} '
         f'\n| frequence_penalty={frequence_penalty} '
+        f'\n| presence_penalty={presence_penalty} '
         f'\n| temperature={temperature} '
         f'\n| hf_model_name={hf_model_name} '
         f'\n| model_path={model_path} '
         if SAVE_LOGS:
             LOG_FILE = open(LOG_PATH, 'a', encoding='utf-8')
+    if ENABLE_BATCH_INFER:
+        demo_file = gr.Interface(
+            batch_inference,
+            inputs=[
+                gr.File(file_count='single', file_types=['json']),
+                gr.Radio(["chat", "few-shot"], value='chat', label="Chat or Few-shot mode", info="Chat's output more user-friendly, Few-shot's output more consistent with few-shot patterns."),
+                gr.Number(value=temperature, label='Temperature (higher -> more random)'),
+                gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
+                gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
+                gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
+                gr.Textbox(value="[STOP],[END],<s>,</s>", label='Comma-separated STOP string to stop generation only in few-shot mode', lines=1),
+                gr.Number(value=0, label='current_time', visible=False),
             ],
+            outputs=[
+                # "file",
+                gr.File(label="Generated file"),
+                # gr.Textbox(),
+                # "json"
+                gr.JSON(label='Example outputs (max 2 samples)')
+            ],
+            # examples=[[[os.path.join(os.path.dirname(__file__),"files/titanic.csv"),
+            # os.path.join(os.path.dirname(__file__),"files/titanic.csv"),
+            # os.path.join(os.path.dirname(__file__),"files/titanic.csv")]]],
+            # cache_examples=True
+            description=FILE_UPLOAD_DESCRIPTION
+        )
+        demo_chat = gr.ChatInterface(
+            response_fn,
+            chatbot=ChatBot(
+                label=MODEL_NAME,
+                bubble_full_width=False,
+                latex_delimiters=[
+                    { "left": "$", "right": "$", "display": False},
+                    { "left": "$$", "right": "$$", "display": True},
+                ],
+                show_copy_button=True,
+            ),
+            textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
+            submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
+            # ! consider preventing the stop button
+            # stop_btn=None,
+            # title=f"{model_title}",
+            # description=f"{model_desc}",
+            additional_inputs=[
+                gr.Number(value=temperature, label='Temperature (higher -> more random)'),
+                gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
+                gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
+                gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
+                gr.Number(value=0, label='current_time', visible=False),
+                # ! Remove the system prompt textbox to avoid jailbreaking
+                # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
+            ],
+        )
+        demo = CusTabbedInterface(
+            interface_list=[demo_chat, demo_file],
+            tab_names=["Chat Interface", "Batch Inference"],
+            title=f"{model_title}",
+            description=f"{model_desc}",
+        )
+        demo.title = MODEL_NAME
+        with demo:
+            gr.Markdown(cite_markdown)
+            if DISPLAY_MODEL_PATH:
+                gr.Markdown(path_markdown.format(model_path=model_path))
+            if ENABLE_AGREE_POPUP:
+                demo.load(None, None, None, _js=AGREE_POP_SCRIPTS)
+        demo.queue()
+        demo.launch(server_port=PORT)
+    else:
+        demo = gr.ChatInterface(
+            response_fn,
+            chatbot=ChatBot(
+                label=MODEL_NAME,
+                bubble_full_width=False,
+                latex_delimiters=[
+                    { "left": "$", "right": "$", "display": False},
+                    { "left": "$$", "right": "$$", "display": True},
+                ],
+                show_copy_button=True,
+            ),
+            textbox=gr.Textbox(placeholder='Type message', lines=8, max_lines=128, min_width=200),
+            submit_btn=gr.Button(value='Submit', variant="primary", scale=0),
+            # ! consider preventing the stop button
+            # stop_btn=None,
+            title=f"{model_title}",
+            description=f"{model_desc}",
+            additional_inputs=[
+                gr.Number(value=temperature, label='Temperature (higher -> more random)'),
+                gr.Number(value=max_tokens, label='Max generated tokens (increase if want more generation)'),
+                gr.Number(value=frequence_penalty, label='Frequency penalty (> 0 encourage new tokens over repeated tokens)'),
+                gr.Number(value=presence_penalty, label='Presence penalty (> 0 encourage new tokens, < 0 encourage existing tokens)'),
+                gr.Number(value=0, label='current_time', visible=False),
+                # ! Remove the system prompt textbox to avoid jailbreaking
+                # gr.Textbox(value=sys_prompt, label='System prompt', lines=8)
+            ],
+        )
+        demo.title = MODEL_NAME
+        with demo:
+            gr.Markdown(cite_markdown)
+            if DISPLAY_MODEL_PATH:
+                gr.Markdown(path_markdown.format(model_path=model_path))
+            if ENABLE_AGREE_POPUP:
+                demo.load(None, None, None, _js=AGREE_POP_SCRIPTS)
+        demo.queue()
+        demo.launch(server_port=PORT)
 def main():
 if __name__ == "__main__":
+    main()