Spaces:
Running
Running
File size: 4,468 Bytes
f8f9857 8fba4df f234576 37a6f17 f8f9857 8c44a86 f8f9857 502727a 3b46104 2ddacc1 f8f9857 db111cc 3b46104 f8f9857 2ddacc1 b33166a 48b4e30 bc659ca 48b4e30 8deb142 5271c72 8deb142 0edb841 d2652c3 b33166a 401ed7b b33166a 7960b44 e0f5396 2dc9b40 0f8e818 b81c4dd b33166a 48b4e30 c5c5495 b33166a 267d0d5 b33166a 267d0d5 b33166a 267d0d5 b33166a 3bb24fe 9818cdf befbf2f f8f9857 9ba5511 b33166a f8f9857 d2a0386 e0f5396 3b46104 d2a0386 2dc9b40 8deb142 b33166a f8f9857 b33166a 502727a e6fd55c b33166a b81c4dd 43874bd 1077094 3b46104 00f1945 b33166a fd4ffc9 bf1bb7a f15787f e0f5396 b33166a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# TODO: add disclaimers.
# Why is it slow? I'm not made of gold.
import os
# Installs the runtime dependencies with pip when the module is loaded.
# NOTE(review): this runs before the llama_cpp / transformers imports below,
# presumably because those packages are not preinstalled - confirm before
# reordering these lines.
os.system('pip install llama-cpp-python transformers torch')
import gradio as gr
from llama_cpp import Llama
from transformers import AutoTokenizer
from huggingface_hub import upload_file
import json
from uuid import uuid4
# Hub repository that both the GGUF weights and the tokenizer are loaded from.
model_id = "Elijahbodden/eliGPTv1.3"

# MODEL
model = Llama.from_pretrained(
    repo_id=model_id,
    filename="model.gguf",
    # Context size and thread counts handed to llama.cpp.
    n_ctx=8192,
    n_threads=2,
    n_threads_batch=2,
    verbose=True,
)

# TOKENIZER AND TEMPLATE
# The tokenizer is used for its chat template and its EOS token id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Seed conversations, keyed by the preset name offered in the UI. Each value
# is a list of {"role": ..., "content": ...} message dicts.
# NOTE(review): the "????" in the second key and the "π" characters look like
# emoji lost in an encoding round-trip - confirm against the intended text.
presets = {
    name: [{"role": speaker, "content": text} for speaker, text in turns]
    for name, turns in (
        (
            "Default",
            (
                ("user", "good convo, bye"),
                ("assistant", "Haha cool ttyl"),
            ),
        ),
        (
            "Rizz ????",
            (
                ("user", "omg it's so hot when you flirt with me"),
                ("assistant", "haha well you're lucky can even string a sentence together, the way you take my breath away π"),
                ("user", "alright love you, gn!"),
                ("assistant", "ttyl babe π"),
            ),
        ),
        (
            "Thinky",
            (
                ("user", "Woah you just totally blew my mind\ngehh now the fermi paradox is going to be bugging me 24/7\nok ttyl"),
                ("assistant", "nah our deep convos are always the best, we should talk again soon\nttyl"),
            ),
        ),
    )
}
def custom_lp_logits_processor(ids, logits, lp_start, lp_decay, prompt_tok_len):
    """Scale the end-of-sequence logit once the reply exceeds ``lp_start`` tokens.

    Args:
        ids: Token ids seen so far; only ``len(ids)`` is used, and it is
            treated as prompt tokens plus generated tokens.
        logits: Logits indexable by token id; modified in place.
        lp_start: Number of generated tokens after which scaling begins.
        lp_decay: Base of the scaling factor; the exponent is the number of
            generated tokens past ``lp_start``.
        prompt_tok_len: Number of tokens in ``ids`` that belong to the prompt.

    Returns:
        The same ``logits`` object that was passed in.
    """
    generated_tok_number = len(ids) - prompt_tok_len
    if generated_tok_number > lp_start:
        # Compute the factor once so the debug line reports exactly what is
        # applied. The exponent must exclude the prompt: using len(ids) here
        # logged a misleading value and could raise OverflowError for long
        # prompts combined with a large lp_decay.
        factor = pow(lp_decay, generated_tok_number - lp_start)
        print(len(ids), lp_start, factor)
        # NOTE(review): with lp_decay > 1 this raises the EOS logit only when
        # it is positive; a negative logit is pushed lower - confirm intended.
        logits[tokenizer.eos_token_id] *= factor
    return logits
def respond(
    message,
    history: list[tuple[str, str]],
    preset,
    temperature,
    min_p,
    lp_start,
    lp_decay,
    frequency_penalty,
    presence_penalty,
    max_tokens
):
    """Stream the model's reply to ``message``.

    The prompt is the selected preset's messages, then the non-empty turns of
    ``history``, then ``message``, rendered through the tokenizer's chat
    template. The rendered prompt is printed before generation starts.

    Args:
        message: The new user message.
        history: Earlier turns; index 0 of each entry is the user text and
            index 1 the assistant text. Falsy entries are skipped.
        preset: Key into ``presets`` selecting the seed conversation.
        temperature: Forwarded to ``model.create_completion``.
        min_p: Forwarded to ``model.create_completion``.
        lp_start: Forwarded to ``custom_lp_logits_processor``.
        lp_decay: Forwarded to ``custom_lp_logits_processor``.
        frequency_penalty: Forwarded to ``model.create_completion``.
        presence_penalty: Forwarded to ``model.create_completion``.
        max_tokens: Forwarded to ``model.create_completion``.

    Yields:
        The response text accumulated so far, once per streamed chunk.
    """
    # Copy the list so the shared preset is never extended.
    messages = list(presets[preset])
    # NOTE(review): preset messages use "role"/"content" keys while the turns
    # added here use "from"/"value" - confirm the chat template handles both.
    for turn in history:
        for speaker, text in (("user", turn[0]), ("assistant", turn[1])):
            if text:
                messages.append({"from": speaker, "value": text})
    messages.append({"from": "user", "value": message})

    # Log the rendered prompt, then tokenize it for generation.
    print(tokenizer.apply_chat_template(messages, tokenize=False))
    convo = tokenizer.apply_chat_template(messages, tokenize=True)

    def length_penalty(ids, logits):
        # Bind the UI settings and the prompt length for the processor.
        return custom_lp_logits_processor(ids, logits, lp_start, lp_decay, len(convo))

    stream = model.create_completion(
        convo,
        stream=True,
        stop=["<|im_end|>"],
        max_tokens=max_tokens,
        temperature=temperature,
        min_p=min_p,
        # Disable top-k pruning
        top_k=100000000,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        logits_processor=length_penalty,
    )

    response = ""
    for chunk in stream:
        response += chunk["choices"][0]["text"]
        yield response
# Chat UI around `respond`. The entries of `additional_inputs` are listed in
# the same order as respond's parameters after (message, history): preset,
# temperature, min_p, lp_start, lp_decay, frequency_penalty,
# presence_penalty, max_tokens. Keep the two in sync when editing either.
demo = gr.ChatInterface(
    respond,
    additional_inputs_accordion=gr.Accordion(label="Options", open=True),
    css=".bubble-gap {gap: 6px !important}",
    theme="shivi/calm_seafoam",
    description="The model may be slow if it hasn't run recently or a lot of people are using it",
    title="EliGPT v1.3",
    additional_inputs=[
        # A concrete list of the preset names rather than a dict_keys view.
        gr.Radio(list(presets), label="Personality preset", info="VERY SLIGHTLY influence the model's personality [WARNING, IF YOU CHANGE THIS WHILE THERE ARE MESSAGES IN THE CHAT, THE MODEL WILL BECOME VERY SLOW]", value="Default"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature", info="How chaotic should the model be?"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="Min_p", info="Lower values give it more \"personality\""),
        gr.Slider(minimum=0, maximum=512, value=5, step=1, label="Length penalty start", info='When should the model start being more likely to shut up?'),
        gr.Slider(minimum=0.5, maximum=1.5, value=1.01, step=0.001, label="Length penalty decay factor", info='How fast should that stop likelihood increase?'),
        # Fixed the typo in the help text ("Don'repeat" -> "Don't repeat").
        gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="Frequency penalty", info='"Don\'t repeat yourself"'),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="Presence penalty", info='"Use lots of diverse words"'),
        gr.Slider(minimum=1, maximum=1024, value=1024, step=1, label="Max new tokens", info="How many words can the model generate at most?"),
    ],
)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()