simjeg committed on
Commit
d75eb4f
·
1 Parent(s): ebdd53e
Files changed (2) hide show
  1. app.py +121 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This space is mostly a copy of the work of Aritra Roy Gosthipaty (see https://huggingface.co/spaces/ariG23498/kv-press/blob/main/app.py)
2
+
3
+ import spaces
4
+ import requests
5
+ import gradio as gr
6
+ from bs4 import BeautifulSoup
7
+ from transformers import pipeline
8
+
9
+ from kvpress import (
10
+ ExpectedAttentionPress,
11
+ KnormPress,
12
+ RandomPress,
13
+ SnapKVPress,
14
+ StreamingLLMPress,
15
+ TOVAPress,
16
+ )
17
+
18
# Registry of the selectable presses: maps the name shown in the UI dropdown
# to the kvpress class that is instantiated for a request.
press_dict = dict(
    ExpectedAttentionPress=ExpectedAttentionPress,
    KnormPress=KnormPress,
    RandomPress=RandomPress,
    SnapKVPress=SnapKVPress,
    StreamingLLMPress=StreamingLLMPress,
    TOVAPress=TOVAPress,
)
26
+
27
+
28
@spaces.GPU
def process_request(url, question, press_name, compression_ratio):
    """Answer a question about a web article using a compressed KV cache.

    Downloads ``url``, joins the text of every ``<p>`` element into a single
    context string, and runs the module-level ``pipe`` on that context with
    the selected press.

    Args:
        url: Address of the article to download.
        question: Question forwarded to the pipeline.
        press_name: Key of ``press_dict`` selecting the press class.
        compression_ratio: Passed as the only positional argument to the
            press constructor.

    Returns:
        A ``(text, num_tokens)`` tuple. On success ``text`` is the pipeline's
        ``"answer"`` and ``num_tokens`` is the token count of the context.
        On any failure ``text`` is an error message and ``num_tokens`` is -1;
        errors are reported through the return value, never raised.
    """
    if press_name not in press_dict:
        return f"Invalid press type selected: {press_name}", -1

    # Fetch the Wikipedia article. The timeout keeps a stalled server from
    # blocking the worker forever, and raise_for_status() turns HTTP error
    # responses (404, 500, ...) into a RequestException so that an error page
    # is never parsed and fed to the model as if it were the article.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error fetching the Wikipedia article: {str(e)}", -1

    try:
        # Parse the HTML and keep only the paragraph text
        soup = BeautifulSoup(response.content, "html.parser")
        article_text = "".join(p.text for p in soup.find_all("p"))
        if not article_text.strip():
            # Nothing to answer from: avoid running the model on an empty context
            return "Error: no paragraph text found at the given URL.", -1
        context = article_text + "\n\n"

        # Initialize the press
        press = press_dict[press_name](compression_ratio)
        num_tokens = pipe.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
        pred_answer = pipe(context, question=question, press=press)["answer"]

        return pred_answer, num_tokens
    except Exception as e:
        # Deliberately broad: this is the UI boundary, so every failure is
        # shown to the user in the output box instead of crashing the request.
        if "CUDA out of memory" in str(e):
            return "Error: CUDA out of memory. Try using a smaller article or a lower compression ratio.", -1
        return str(e), -1
57
+
58
+
59
def gradio_interface():
    """Build the Gradio Blocks UI for the demo.

    The layout is an explanatory header, a row with the article URL and the
    question, a row with the press dropdown and the compression-ratio slider,
    two output fields (answer text and token count), a submit button and one
    pre-filled example. Clicking the button calls ``process_request`` with the
    four inputs and writes its two return values to the two output fields.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # Wikipedia Article Question Answering with kvpress
            This demo uses the llama 3.1 8B Instruct model to answer questions about any given Wikipedia article.
            Under the hood, [kvpress](https://github.com/NVIDIA/kvpress) *compresses the key-value (KV) cache* associated with the article, helping reduce memory usage and accelerate decoding.
            **How to use:**
            1. Enter a Wikipedia article URL
            2. Type your question
            3. Select a press type and the desired compression ratio
            4. Press "Submit" to see the answer, along with token statistics before and after compression
            """
        )

        with gr.Row():
            url_input = gr.Textbox(label="Wikipedia Article URL", placeholder="Enter the Wikipedia article URL here")
            question_input = gr.Textbox(label="Question", placeholder="Type your question here")

        with gr.Row():
            press_selector = gr.Dropdown(
                choices=list(press_dict.keys()),
                value="ExpectedAttentionPress",
                label="Select Press Type",
            )
            # Capped at 0.9: a ratio of 1.0 would ask the press to prune the
            # entire KV cache, leaving nothing of the article to answer from
            # (NOTE(review): kvpress is expected to reject ratios >= 1 — confirm).
            compression_slider = gr.Slider(minimum=0.1, maximum=0.9, step=0.1, value=0.5, label="Compression Ratio")

        output = gr.Textbox(label="Output", lines=10)
        output_num_tokens = gr.Number(label="Number of Tokens", interactive=False)

        submit_button = gr.Button("Submit")

        gr.Examples(
            examples=[
                [
                    "https://en.wikipedia.org/wiki/Nvidia",
                    "Complete this sentence: The Nvidia GeForce Partner Program was a ...",
                    "ExpectedAttentionPress",
                    0.5,
                ],
            ],
            inputs=[url_input, question_input, press_selector, compression_slider],
        )

        submit_button.click(
            process_request,
            inputs=[url_input, question_input, press_selector, compression_slider],
            outputs=[output, output_num_tokens],
        )

    return demo
110
+
111
+
112
if __name__ == "__main__":
    # Build the generation pipeline once at start-up. `pipe` is bound at
    # module level because process_request looks it up as a global.
    device = "cuda:0"
    ckpt = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    pipe = pipeline(
        "kv-press-text-generation",
        model=ckpt,
        device=device,
        torch_dtype="auto",
    )

    # Assemble the UI and start serving it
    demo = gradio_interface()
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ kvpress
2
+ gradio
3
+ spaces