Spaces:
Runtime error
Runtime error
File size: 5,554 Bytes
4431147 8f8b6ee 4431147 8f8b6ee 4431147 8f8b6ee 983c445 8f8b6ee 4431147 8f8b6ee 4431147 8f8b6ee 4431147 997e24d 4431147 997e24d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import os
from threading import Thread
from typing import Iterator, List, Tuple
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr
from gradio import Blocks
from transformers import TextIteratorStreamer
# Base causal-LM weights, loaded in float16. `device_map="auto"` leaves device
# placement to the loader instead of pinning a device here.
base_model = AutoModelForCausalLM.from_pretrained(
    'NousResearch/Llama-2-7b-chat-hf',
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# NOTE(review): the tokenizer is fetched from a different repo id than the
# weights above — confirm both repos are accessible from the deployment and
# that this split is intentional.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')

# Wrap the base model with the fine-tuned LoRA adapter and put the result in
# evaluation mode (`eval()` returns the module itself).
model = PeftModel.from_pretrained(
    base_model,
    'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora',
).eval()
# Generation limits.
# Upper bound offered for "max new tokens" (used as the UI slider maximum).
MAX_MAX_NEW_TOKENS = 2048
# Value used when the caller does not pick a "max new tokens" setting.
DEFAULT_MAX_NEW_TOKENS = 1024
# Longest prompt (in tokens) passed to the model; overridable through the
# MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))
# FastAPI setup
# Application object on which the POST /chat/ route further down is registered.
app = FastAPI()
class ChatRequest(BaseModel):
    # JSON body accepted by POST /chat/. Only `message` is required; every
    # other field falls back to the default shown here.

    # The new user turn to answer.
    message: str
    # Earlier turns as (user, assistant) pairs, oldest first.
    chat_history: List[Tuple[str, str]] = []
    # Optional system instruction; an empty string means "no system turn".
    system_prompt: str = ""

    # Sampling / decoding settings forwarded to generation.
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
    temperature: float = 0.6
    top_p: float = 0.9
    top_k: int = 50
    repetition_penalty: float = 1.2
@app.post("/chat/")
async def chat(request: ChatRequest):
    # HTTP entry point: forwards the validated request fields to
    # generate_response() and wraps the generated text in a JSON object.
    # Any failure during generation is reported as HTTP 500 with the
    # exception text as the detail message.
    try:
        reply = await generate_response(
            message=request.message,
            chat_history=request.chat_history,
            system_prompt=request.system_prompt,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            repetition_penalty=request.repetition_penalty,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"response": reply}
async def generate_response(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    """Generate the assistant reply for ``message`` and return it as one string.

    The conversation (optional system prompt, prior turns, new message) is
    rendered with the tokenizer's chat template, truncated to the most recent
    ``MAX_INPUT_TOKEN_LENGTH`` tokens if necessary, and sampled from ``model``.

    Args:
        message: The new user turn.
        chat_history: Previous ``(user, assistant)`` turns, oldest first.
        system_prompt: Optional system instruction; skipped when empty.
        max_new_tokens: Requested generation budget. Clamped to the range
            ``[1, MAX_MAX_NEW_TOKENS]`` so API callers cannot exceed the limit
            the UI already enforces.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.

    Returns:
        The concatenation of all text chunks produced by the streamer
        (prompt and special tokens are skipped by the streamer settings).

    Raises:
        Exception: Errors raised while tokenizing or while reading from the
            streamer propagate to the caller. NOTE(review): an error inside
            the generation thread is not forwarded directly; the reader is
            expected to hit the streamer's 10 s timeout instead — confirm.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend(
            [
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
            ]
        )
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the tail of the prompt, i.e. the most recent tokens.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)

    # Enforce the module-wide generation cap on every entry point, not only
    # on the Gradio slider.
    max_new_tokens = max(1, min(int(max_new_tokens), MAX_MAX_NEW_TOKENS))

    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }

    def _collect_reply() -> str:
        """Run generation in a worker thread and drain the streamer."""
        worker = Thread(target=model.generate, kwargs=generate_kwargs)
        worker.start()
        text = "".join(streamer)
        # The stream has ended, so generation is finishing; reap the thread.
        worker.join()
        return text

    # Reading the streamer blocks on a queue. Doing that directly inside this
    # coroutine would stall the event loop (and every other request) for the
    # whole generation, so the blocking part runs in the default executor.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _collect_reply)
# Gradio setup
async def generate(
    message: str,
    chat_history: List[Tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> str:
    """Chat callback for the Gradio interface.

    Delegates to ``generate_response`` with the same arguments. This must be
    a coroutine function that awaits the call: ``generate_response`` is
    ``async``, so a plain ``def`` that merely returns its result would hand
    the UI an un-awaited coroutine object instead of the reply text.

    Args:
        message: The new user turn typed into the chat box.
        chat_history: Previous ``(user, assistant)`` turns.
        system_prompt: Value of the "System prompt" textbox.
        max_new_tokens: Value of the "Max new tokens" slider.
        temperature: Value of the "Temperature" slider.
        top_p: Value of the "Top-p (nucleus sampling)" slider.
        top_k: Value of the "Top-k" slider.
        repetition_penalty: Value of the "Repetition penalty" slider.

    Returns:
        The complete generated reply as a single string.
    """
    return await generate_response(
        message,
        chat_history,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    )
# Extra controls shown with the chat box. Their order matches the positional
# parameters of `generate` after (message, chat_history): system prompt,
# max new tokens, temperature, top-p, top-k, repetition penalty.
_additional_inputs = [
    gr.Textbox(label="System prompt", lines=6),
    gr.Slider(
        label="Max new tokens",
        minimum=1,
        maximum=MAX_MAX_NEW_TOKENS,
        step=1,
        value=DEFAULT_MAX_NEW_TOKENS,
    ),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        value=0.9,
    ),
    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
    gr.Slider(
        label="Repetition penalty",
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        value=1.2,
    ),
]

# Sample prompts offered in the UI; each entry supplies only the message.
_example_prompts = [
    ["Hello there! How are you doing?"],
    ["Can you explain briefly to me what is the Python programming language?"],
    ["Explain the plot of Cinderella in a sentence."],
    ["How many hours does it take a man to eat a Helicopter?"],
    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
]

# Chat widget wired to `generate`; no stop button is shown.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=_additional_inputs,
    stop_btn=None,
    examples=_example_prompts,
)
# Static page copy. The triple-quoted text is kept flush-left on purpose so
# the string contents stay exactly as written.
_INTRO_MARKDOWN = """
This Space demonstrates the Llama-2 7B Chat model by Meta, fine-tuned for chat instructions.
Feel free to chat with the model here or use the API to integrate it into your applications.
"""
_LICENSE_MARKDOWN = "This demo is governed by the original [license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/LICENSE.txt)."

# Page layout: title, intro text, the chat widget, a divider, license note.
# NOTE(review): in the visible code the FastAPI `app` is never mounted onto
# this Blocks app — confirm how the /chat/ route is meant to be served.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("# Llama-2 7B Chat")
    gr.Markdown(_INTRO_MARKDOWN)
    chat_interface.render()
    gr.Markdown("---")
    gr.Markdown(_LICENSE_MARKDOWN)
if __name__ == "__main__":
    # Script entry point: enable request queuing (at most 20 waiting
    # requests), then start the Gradio server.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()
|