rwkv-v5-1b5-cpu

Running

File size: 5,841 Bytes

4a2c956
e63ee0a
7c790c0
 
61b9ff7
 
 
4643bb5
a00d592
4a2c956
7c790c0
 
 
 
a00d592
 
7c790c0
a00d592
7c790c0
bd8572f
e63ee0a
 
 
2d1a42e
e63ee0a
2d1a42e
e63ee0a
2d1a42e
e63ee0a
2d1a42e
 
 
e63ee0a
2d1a42e
e63ee0a
2d1a42e
e63ee0a
 
2d1a42e
e63ee0a
 
 
 
 
7c790c0
 
 
 
e63ee0a
 
8105dac
7c790c0
 
 
 
 
 
ea2eccb
7c790c0
 
 
 
 
 
 
0886a44
 
7c790c0
 
 
 
 
 
 
 
 
 
e63ee0a
 
 
29c4970
 
0083156
 
7c790c0
 
 
8105dac
 
 
 
761f3c0
3498824
8105dac
 
7c790c0
 
e63ee0a
 
 
a00d592
d93e535
 
e63ee0a
 
3d38c26
8105dac
a00d592
e63ee0a
 
 
 
 
 
 
8105dac
2d1a42e
 
e63ee0a
2d1a42e
7c790c0
e63ee0a
7c790c0

import gradio as gr
import os, gc, copy, torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
ctx_limit = 2000
title = "RWKV-5-World-1.5B-v2-OnlyForTest_56%_trained-20231013-ctx4096"

os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/temp", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")

def generate_prompt(instruction, input=""):
    instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
    input = input.strip().replace('\r\n','\n').replace('\n\n','\n')
    if input:
        return f"""Instruction: {instruction}

Input: {input}

Response:"""
    else:
        return f"""User: hi

Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.

User: {instruction}

Assistant:"""

def evaluate(
    ctx,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty = 0.1,
    countPenalty = 0.1,
):
    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
                     alpha_frequency = countPenalty,
                     alpha_presence = presencePenalty,
                     token_ban = [], # ban the generation of some tokens
                     token_stop = [0]) # stop generation whenever you see any token here
    ctx = ctx.strip()
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        for xxx in occurrence:
            occurrence[xxx] *= 0.996        
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1
        
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')  
    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()

examples = [
    [generate_prompt("Tell me about ravens."), 333, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Écrivez un programme Python pour miner 1 Bitcoin."), 333, 1, 0.5, 0.4, 0.4],
    [generate_prompt("東京で訪れるべき素晴らしい場所とその紹介をいくつか挙げてください。"), 333, 1, 0.5, 0.4, 0.4],
    [generate_prompt("Write a story using the following information", "A man named Alex chops a tree down"), 333, 1, 0.5, 0.4, 0.4],
    ["Assistant: Here is a list of adjectives that describe a person as brave:", 333, 1, 0.5, 0.4, 0.4],
    ["Assistant: Here is my proposal to kill all mosquitoes.", 333, 1, 0.5, 0.4, 0.4],
    [generate_prompt("写一篇关于水利工程的流体力学模型的论文，需要详细全面。"), 333, 1, 0.5, 0.4, 0.4],
    [generate_prompt("You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan."), 333, 1, 0.5, 0.4, 0.4],
]

##########################################################################

with gr.Blocks(title=title) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
    with gr.Tab("Raw Generation"):
        gr.Markdown(f"This is [RWKV-5 World v2](https://huggingface.co/BlinkDL/rwkv-5-world) with 1.5B params - a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}.")
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=2, label="Prompt", value=generate_prompt("Tell me about ravens."))
                token_count = gr.Slider(10, 333, label="Max Tokens", step=10, value=333)
                temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
                presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
                count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
            with gr.Column():
                with gr.Row():
                    submit = gr.Button("Submit", variant="primary")
                    clear = gr.Button("Clear", variant="secondary")
                output = gr.Textbox(label="Output", lines=5)
        data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
        submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
        clear.click(lambda: None, [], [output])
        data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])

demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)