Spaces:
Running
Running
File size: 5,547 Bytes
4a2c956 e63ee0a 7c790c0 61b9ff7 59618c7 4643bb5 f092f6f cf1cc65 ab491fe 59618c7 4a2c956 7c790c0 59618c7 f787cc5 42e5f71 f787cc5 42e5f71 f787cc5 59618c7 7c790c0 cf1cc65 59618c7 7c790c0 a00d592 7c790c0 59618c7 bd8572f e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 2d1a42e e63ee0a 59618c7 e63ee0a 2d1a42e e63ee0a 7c790c0 6d8da18 7c790c0 e63ee0a 8105dac 7c790c0 ea2eccb 7c790c0 0886a44 7c790c0 e63ee0a 0b1be7c 29c4970 0083156 0b1be7c 7c790c0 a4c73f6 7c790c0 e63ee0a 59618c7 e63ee0a a00d592 d93e535 0abab0b e63ee0a d629da6 a00d592 8ecee14 69f64d5 8ecee14 e63ee0a 8105dac e5f4626 2d1a42e e63ee0a 2d1a42e 7c790c0 59618c7 88b3962 e7333b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import gradio as gr
import os, gc, copy, torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
# Flag to check if GPU is present
HAS_GPU = False
# Model title and context size limit
ctx_limit = 2000
title = "RWKV-5-World-1B5-v2-20231025-ctx4096"
model_file = "rwkv-5-h-world-1b5"
# Get the GPU count
try:
nvmlInit()
GPU_COUNT = nvmlDeviceGetCount()
if GPU_COUNT > 0:
HAS_GPU = True
gpu_h = nvmlDeviceGetHandleByIndex(0)
except NVMLError as error:
print(error)
os.environ["RWKV_JIT_ON"] = '1'
# Model strat to use
MODEL_STRAT="cpu bf16"
os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
# Switch to GPU mode
if HAS_GPU == True :
os.environ["RWKV_CUDA_ON"] = '1'
MODEL_STRAT = "cuda bf16"
# Load the model accordingly
from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="a686d380/rwkv-5-h-world", filename=f"{model_file}.pth")
model = RWKV(model=model_path, strategy=MODEL_STRAT)
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
# Prompt generation
def generate_prompt(instruction, input=""):
instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
if input:
return f"""Instruction: {instruction}
Input: {input}
Response:"""
else:
return f"""User: hi
Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.
User: {instruction}
Assistant:"""
# Evaluation logic
def evaluate(
ctx,
token_count=200,
temperature=1.0,
top_p=0.7,
presencePenalty = 0.1,
countPenalty = 0.1,
):
print(ctx)
args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
alpha_frequency = countPenalty,
alpha_presence = presencePenalty,
token_ban = [], # ban the generation of some tokens
token_stop = [0]) # stop generation whenever you see any token here
ctx = ctx.strip()
all_tokens = []
out_last = 0
out_str = ''
occurrence = {}
state = None
for i in range(int(token_count)):
out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
for n in occurrence:
out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
if token in args.token_stop:
break
all_tokens += [token]
for xxx in occurrence:
occurrence[xxx] *= 0.996
if token not in occurrence:
occurrence[token] = 1
else:
occurrence[token] += 1
tmp = pipeline.decode(all_tokens[out_last:])
if '\ufffd' not in tmp:
out_str += tmp
yield out_str.strip()
out_last = i + 1
if HAS_GPU == True :
gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
del out
del state
gc.collect()
if HAS_GPU == True :
torch.cuda.empty_cache()
yield out_str.strip()
##########################################################################
# Gradio blocks
with gr.Blocks(title=title) as demo:
gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
with gr.Tab("Raw Generation"):
gr.Markdown(f"This is RWKV-5 World v2 with 1.5B params - a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). Supports all 100+ world languages and code. And we have [200+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}.")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(lines=2, label="Prompt", value="裸体少女")
token_count = gr.Slider(10, 300, label="Max Tokens", step=10, value=200)
temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.3)
presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=1)
count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=1)
with gr.Column():
with gr.Row():
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear", variant="secondary")
output = gr.Textbox(label="Output", lines=5)
data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
clear.click(lambda: None, [], [output])
data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
# Gradio launch
demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)
|