Spaces:
Running
on
Zero
Running
on
Zero
NGUYEN, Xuan Phi
committed on
Commit
•
f028d50
1
Parent(s):
203c3cd
update
Browse files
app.py
CHANGED
@@ -10,43 +10,45 @@ tensor_parallel must == 1
|
|
10 |
|
11 |
"""
|
12 |
|
13 |
-
|
14 |
import os
|
15 |
import numpy as np
|
16 |
import argparse
|
17 |
-
|
18 |
import gradio as gr
|
19 |
-
from gradio_client.documentation import document, set_documentation_group
|
20 |
-
|
21 |
-
from typing import List, Optional, Union, Dict, Tuple
|
22 |
-
|
23 |
-
from tqdm import tqdm
|
24 |
-
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
25 |
-
|
26 |
-
from vllm.engine.arg_utils import EngineArgs
|
27 |
-
from vllm.engine.llm_engine import LLMEngine
|
28 |
-
from vllm.outputs import RequestOutput
|
29 |
-
from vllm.sampling_params import SamplingParams
|
30 |
-
from vllm.utils import Counter
|
31 |
-
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
32 |
-
SequenceGroupMetadata, SequenceOutputs,
|
33 |
-
SequenceStatus)
|
34 |
-
|
35 |
-
# ! reconfigure vllm to faster llama
|
36 |
from typing import Any, Iterator
|
37 |
from typing import Iterator, List, Optional, Tuple
|
38 |
import filelock
|
39 |
import glob
|
40 |
import json
|
41 |
-
import os
|
42 |
-
from huggingface_hub import snapshot_download
|
43 |
|
|
|
|
|
|
|
44 |
from tqdm.auto import tqdm
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
from vllm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
|
|
|
50 |
|
51 |
|
52 |
def hf_model_weights_iterator(
|
@@ -661,18 +663,35 @@ def debug_chat_response_echo(
|
|
661 |
yield message
|
662 |
|
663 |
|
|
|
664 |
MODEL_TITLE = "DAMO-SeaL-13B - An Assistant for South East Asian Languages"
|
665 |
MODEL_DESC = """
|
666 |
This is a 13B DAMO-SeaL-Chat assistant model built by DAMO Academy, Alibaba Group. It can produce helpful responses in English, Vietnamese, Indonesian and Thai.
|
667 |
""".strip()
|
668 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
669 |
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
670 |
DTYPE = 'bfloat16'
|
671 |
DTYPE = 'float16'
|
672 |
|
673 |
MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
|
674 |
|
675 |
-
|
|
|
676 |
|
677 |
def launch():
|
678 |
global demo, llm, DEBUG
|
@@ -720,6 +739,8 @@ def launch():
|
|
720 |
gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
|
721 |
gr.Textbox(value=sys_prompt, label='System prompt', lines=8)],
|
722 |
)
|
|
|
|
|
723 |
demo.queue()
|
724 |
# demo.launch(server_port=args.port)
|
725 |
demo.launch()
|
|
|
10 |
|
11 |
"""
|
12 |
|
13 |
+
|
14 |
import os
|
15 |
import numpy as np
|
16 |
import argparse
|
17 |
+
import torch
|
18 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
from typing import Any, Iterator
|
20 |
from typing import Iterator, List, Optional, Tuple
|
21 |
import filelock
|
22 |
import glob
|
23 |
import json
|
|
|
|
|
24 |
|
25 |
+
from gradio_client.documentation import document, set_documentation_group
|
26 |
+
|
27 |
+
from typing import List, Optional, Union, Dict, Tuple
|
28 |
from tqdm.auto import tqdm
|
29 |
+
from huggingface_hub import snapshot_download
|
30 |
+
|
31 |
+
DEBUG = True
|
32 |
+
|
33 |
+
if not DEBUG:
|
34 |
|
35 |
+
# vllm import
|
36 |
+
from vllm import LLM, SamplingParams
|
37 |
+
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
38 |
+
from vllm.engine.arg_utils import EngineArgs
|
39 |
+
from vllm.engine.llm_engine import LLMEngine
|
40 |
+
from vllm.outputs import RequestOutput
|
41 |
+
from vllm.sampling_params import SamplingParams
|
42 |
+
from vllm.utils import Counter
|
43 |
+
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
44 |
+
SequenceGroupMetadata, SequenceOutputs,
|
45 |
+
SequenceStatus)
|
46 |
+
# ! reconfigure vllm to faster llama
|
47 |
+
from vllm.model_executor.model_loader import _MODEL_REGISTRY
|
48 |
+
from vllm.model_executor.models import LlamaForCausalLM
|
49 |
|
50 |
+
|
51 |
+
_MODEL_REGISTRY['FasterLlamaForCausalLM'] = LlamaForCausalLM
|
52 |
|
53 |
|
54 |
def hf_model_weights_iterator(
|
|
|
663 |
yield message
|
664 |
|
665 |
|
666 |
+
# ============ CONSTANT ============
|
667 |
MODEL_TITLE = "DAMO-SeaL-13B - An Assistant for South East Asian Languages"
|
668 |
MODEL_DESC = """
|
669 |
This is a 13B DAMO-SeaL-Chat assistant model built by DAMO Academy, Alibaba Group. It can produce helpful responses in English, Vietnamese, Indonesian and Thai.
|
670 |
""".strip()
|
671 |
|
672 |
+
|
673 |
+
cite_markdown = """
|
674 |
+
## Citation
|
675 |
+
If you find our project useful, hope you can star our repo and cite our paper as follows:
|
676 |
+
```
|
677 |
+
@article{damonlpsg2023seallm,
|
678 |
+
author = {???},
|
679 |
+
title = {SeaL: A language model for South East Asian Languages},
|
680 |
+
year = 2023,
|
681 |
+
}
|
682 |
+
"""
|
683 |
+
# journal = {arXiv preprint arXiv:2306.02858}
|
684 |
+
# url = {https://arxiv.org/abs/2306.02858}
|
685 |
+
|
686 |
+
|
687 |
TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1"))
|
688 |
DTYPE = 'bfloat16'
|
689 |
DTYPE = 'float16'
|
690 |
|
691 |
MODEL_PATH = os.environ.get("MODEL_PATH", "notfound, please set `export MODEL_PATH=`")
|
692 |
|
693 |
+
|
694 |
+
|
695 |
|
696 |
def launch():
|
697 |
global demo, llm, DEBUG
|
|
|
739 |
gr.Number(value=0.4, label='Frequency penalty (> 0 encourage new tokens)'),
|
740 |
gr.Textbox(value=sys_prompt, label='System prompt', lines=8)],
|
741 |
)
|
742 |
+
|
743 |
+
gr.Markdown(cite_markdown)
|
744 |
demo.queue()
|
745 |
# demo.launch(server_port=args.port)
|
746 |
demo.launch()
|