# llm_server / app.py
# Hugging Face file-page header (not part of the program):
#   ldhldh - "Update app.py" - commit 99fc69e - 1.87 kB
import asyncio
import inspect
from threading import Thread
from typing import List, Type

import gradio as gr
from gradio import routes
from transformers import AutoTokenizer
# Module-level event loop handle.
# asyncio.get_event_loop() is deprecated when no loop has been set for the
# current thread and raises RuntimeError on newer Python versions, so fall
# back to creating and registering a loop explicitly.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
# init code
def get_types(cls_set: List[Type], component: str):
    """Extract a description and a type name from each class's docstring.

    For ``component == "input"`` the second docstring line is parsed; for any
    other value the last docstring line is parsed.  From the chosen line:

    * the description is the text after the last ``":"``;
    * the type is the text between the last ``"("`` and the first ``")"``.

    Args:
        cls_set: Classes whose docstrings are inspected.
        component: ``"input"`` to read the second docstring line, anything
            else to read the last one.

    Returns:
        A ``(docset, types)`` tuple of two lists, each with one string per
        class in ``cls_set``.  A class with no docstring, or an input class
        whose docstring has no second line, contributes empty strings
        instead of raising.
    """
    # Only the line index differs between the two component kinds.
    line_index = 1 if component == "input" else -1

    docset = []
    types = []
    for cls in cls_set:
        # inspect.getdoc returns None when no docstring can be found.
        doc_lines = (inspect.getdoc(cls) or "").split("\n")
        try:
            line = doc_lines[line_index]
        except IndexError:
            # Docstring too short to contain the requested line.
            line = ""
        docset.append(line.split(":")[-1])
        types.append(line.split(")")[0].split("(")[-1])
    return docset, types
# Monkey-patch gradio: point gradio.routes.get_types at the get_types
# function defined in this file.
routes.get_types = get_types
from petals import AutoDistributedModelForCausalLM
import npc_data
# Model to load. Choose any model available at https://health.petals.dev
model_name = "daekeun-ml/Llama-2-ko-instruct-13B"
# Model ids noted here by the author:
#   daekeun-ml/Llama-2-ko-instruct-13B
#   quantumaikr/llama-2-70b-fb16-korean
# Tokenizer and distributed model are created once at import time; chat2
# reads them as module-level globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoDistributedModelForCausalLM.from_pretrained(model_name)
# Run the model as if it were on your computer
def chat2(id, npc, text):
    """Run one generation pass on an empty prompt, print it, and echo ``text``.

    Args:
        id: Not used by this function.
        npc: Not used by this function.
        text: Returned unchanged.

    Returns:
        ``text`` exactly as passed in; the generated output is only printed.

    NOTE(review): the prompt is always the empty string and the model output
    is discarded after printing - presumably work in progress; confirm the
    intended prompt before wiring this into the UI.
    """
    prompt = ""
    encoded = tokenizer(prompt, return_tensors="pt")
    # Generate at most 100 new tokens from the encoded prompt ids.
    generated = model.generate(encoded["input_ids"], max_new_tokens=100)
    decoded = tokenizer.decode(generated[0])
    print(decoded)
    return text
def chat(id, npc, text):
    """Return a placeholder reply that names the NPC and echoes the message.

    Args:
        id: Not used by this function.
        npc: Name interpolated into the reply.
        text: Message interpolated into the reply.

    Returns:
        The Korean sentence ``"<text>에 대한 <npc>의 응답"``
        ("<npc>'s response to <text>").
    """
    # The literal had been stored as mojibake (UTF-8 Hangul bytes decoded
    # with a single-byte code page); it is restored to the intended text.
    return f"{text}에 대한 {npc}의 응답"
# Build the Gradio UI: a single Interface exposing chat() with three text
# inputs and one text output.
with gr.Blocks() as demo:
    # NOTE(review): `count` is not read anywhere in this file; kept so the
    # module-level name continues to exist.
    count = 0
    # The three text inputs map positionally onto chat(id, npc, text).
    aa = gr.Interface(
        fn=chat,
        inputs=["text", "text", "text"],
        outputs="text",
        # Description text restored from mojibake to the intended Korean.
        description="chat, ai 응답을 반환합니다. 내부적으로 트랜잭션 생성. \n /run/predict",
    )

# Launch once the Blocks context has been closed.  .queue() already turns
# queuing on, so the deprecated `enable_queue` launch argument is dropped.
demo.queue(max_size=32).launch()