# This space is mostly a copy of the work of Aritra Roy Gosthipaty (see https://huggingface.co/spaces/ariG23498/kv-press/blob/main/app.py)
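# Gradio demo: fetch a Wikipedia article, compress its key-value (KV) cache with a
# user-selected kvpress press, and answer a question about the article with
# Llama 3.1 8B Instruct.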
import spaces
import requests
import gradio as gr
from bs4 import BeautifulSoup
from transformers import pipeline
from kvpress import (
    ExpectedAttentionPress,
    KnormPress,
    RandomPress,
    SnapKVPress,
    StreamingLLMPress,
    TOVAPress,
)
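
# Available KV cache compression strategies (roughly, per the kvpress project):
# - ExpectedAttentionPress: scores KV pairs by the attention they are expected to receive from future queries
# - KnormPress: prunes KV pairs whose keys have the largest L2 norm
# - RandomPress: prunes KV pairs at random (a simple baseline)
# - SnapKVPress: scores KV pairs with the attention of the last tokens of the prompt
# - StreamingLLMPress: keeps only the first tokens (attention sinks) and the most recent tokens
# - TOVAPress: scores KV pairs with the attention of the last token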
press_dict = {
    "ExpectedAttentionPress": ExpectedAttentionPress,
    "KnormPress": KnormPress,
    "RandomPress": RandomPress,
    "SnapKVPress": SnapKVPress,
    "StreamingLLMPress": StreamingLLMPress,
    "TOVAPress": TOVAPress,
}


@spaces.GPU
def process_request(url, question, press_name, compression_ratio):
    """Fetch a Wikipedia article from `url`, compress its KV cache with the
    selected press, and answer `question`.

    Returns (answer, num_tokens), where num_tokens is the token count of the
    uncompressed article, or (error_message, -1) on failure.
    """
    if press_name not in press_dict:
        return f"Invalid press type selected: {press_name}", -1

    # Fetch the Wikipedia article (with a timeout so a bad URL doesn't hang the worker)
    try:
        content = requests.get(url, timeout=30).content
    except requests.exceptions.RequestException as e:
        return f"Error fetching the Wikipedia article: {str(e)}", -1

    try:
        # Parse the Wikipedia HTML and keep only the paragraph text
        soup = BeautifulSoup(content, "html.parser")
        context = "".join([p.text for p in soup.find_all("p")]) + "\n\n"

        # Initialize the press with the requested compression ratio
        press = press_dict[press_name](compression_ratio)

        # Count the tokens of the uncompressed article, then answer with compression
        num_tokens = pipe.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
        pred_answer = pipe(context, question=question, press=press)["answer"]
        return pred_answer, num_tokens
    except Exception as e:
        if "CUDA out of memory" in str(e):
            return "Error: CUDA out of memory. Try using a smaller article or a lower compression ratio.", -1
        else:
            return str(e), -1
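
# A minimal standalone sketch of the same kvpress flow used above (hypothetical
# snippet, not part of the app; assumes the model is downloaded, a CUDA device
# is available, and `long_context` holds the article text):
#
#   from transformers import pipeline
#   from kvpress import ExpectedAttentionPress
#
#   pipe = pipeline(
#       "kv-press-text-generation",
#       model="meta-llama/Meta-Llama-3.1-8B-Instruct",
#       device="cuda:0",
#       torch_dtype="auto",
#   )
#   press = ExpectedAttentionPress(0.5)  # prune roughly half of the KV pairs
#   answer = pipe(long_context, question="Your question here", press=press)["answer"]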


def gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Wikipedia Article Question Answering with kvpress

            This demo uses the Llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.
            Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) *compresses the key-value (KV) cache* associated with the article, helping reduce memory usage and accelerate decoding.

            **How to use:**

            1. Enter a Wikipedia article URL
            2. Type your question
            3. Select a press type and the desired compression ratio
            4. Press "Submit" to see the answer, along with the number of tokens in the uncompressed article
            """
        )
        with gr.Row():
            url_input = gr.Textbox(label="Wikipedia Article URL", placeholder="Enter the Wikipedia article URL here")
            question_input = gr.Textbox(label="Question", placeholder="Type your question here")
        with gr.Row():
            press_selector = gr.Dropdown(
                choices=list(press_dict.keys()),
                value="ExpectedAttentionPress",
                label="Select Press Type",
            )
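            # Note: the compression ratio is the fraction of KV pairs pruned from
            # the cache, so 0.5 removes roughly half of the article's KV pairs.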
            compression_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Compression Ratio")
        output = gr.Textbox(label="Output", lines=10)
        output_num_tokens = gr.Number(label="Number of Tokens", interactive=False)
        submit_button = gr.Button("Submit")
        gr.Examples(
            examples=[
                [
                    "https://en.wikipedia.org/wiki/Nvidia",
                    "Complete this sentence: The Nvidia GeForce Partner Program was a ...",
                    "ExpectedAttentionPress",
                    0.5,
                ],
            ],
            inputs=[url_input, question_input, press_selector, compression_slider],
        )
        submit_button.click(
            process_request,
            inputs=[url_input, question_input, press_selector, compression_slider],
            outputs=[output, output_num_tokens],
        )
    return demo


if __name__ == "__main__":
    # Load the kvpress text-generation pipeline
    device = "cuda:0"
    ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    pipe = pipeline("kv-press-text-generation", model=ckpt, device=device, torch_dtype="auto")

    # Launch the demo
    demo = gradio_interface()
    demo.launch()