Commit 0308e6e · Parent(s): 5759fed
Islam YAHIAOUI committed
Correction

Changed files:
- Helpers.py +4 -5
- __pycache__/Helpers.cpython-312.pyc +0 -0
- __pycache__/rag.cpython-312.pyc +0 -0
- app.py +20 -10
- example.py +102 -0
- rag.py +2 -3
Helpers.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import spacy
 import string
 
-def generate_prompt(context, question, history):
+def generate_prompt(context, question, history=None):
 
     # history_summary = ""
     # if history:
@@ -14,16 +14,15 @@ def generate_prompt(context, question, history):
     else:
         prompt_context = "No context provided."
     prompt = f"""
-    <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.<</SYS>>
+    <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content, and dont mention that you used the provided context .<</SYS>>
 
-    Context:
+    Context \n :
     {prompt_context}
 
    [INST] {question} [/INST]
-
-    Response:
    """
 
+    # Response:
     return prompt
 
 # ==============================================================================================================================================
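The only functional changes to Helpers.py are that history becomes optional (history=None) and the trailing "Response:" line moves out of the template into a comment. Below is a minimal sketch of calling the updated helper, assuming context is a list of retrieved document strings (the hunk does not show how prompt_context is actually built from context); the document and question values are placeholders, not part of the commit.

    # Sketch only: `docs` and the question text are hypothetical examples.
    from Helpers import generate_prompt

    docs = ["Paris is the capital of France."]
    prompt = generate_prompt(docs, "What is the capital of France?")  # history now defaults to None
    print(prompt)  # <<SYS>> block, then the context section, then [INST] {question} [/INST]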
__pycache__/Helpers.cpython-312.pyc
CHANGED
Binary files a/__pycache__/Helpers.cpython-312.pyc and b/__pycache__/Helpers.cpython-312.pyc differ
__pycache__/rag.cpython-312.pyc
CHANGED
Binary files a/__pycache__/rag.cpython-312.pyc and b/__pycache__/rag.cpython-312.pyc differ
app.py
CHANGED
@@ -1,12 +1,14 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
+import os
 from rag import run_rag
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-
-
+token = os.environ.get("token_HF", None)
+client = InferenceClient("tiiuae/falcon-11B",token= token)
 
+print(token)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -22,9 +24,9 @@ def respond(
             messages.append({"role": "user", "content": val[0]})
         if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": run_rag(message)})
 
+    messages.append({"role": "user", "content": run_rag(message)})
+
     response = ""
 
     for message in client.chat_completion(
@@ -35,19 +37,21 @@ def respond(
         top_p=top_p,
     ):
         token = message.choices[0].delta.content
-
-        response += token
-        yield response
+        response += str(token)
 
+        yield response
+
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
 demo = gr.ChatInterface(
     respond,
+    title="Retrieval Augmented Generation (RAG) Chatbot" ,
+    fill_height=True,
     additional_inputs=[
-        gr.Textbox(value="You are a
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message" ),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature" ),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
@@ -56,8 +60,14 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
+    examples=[
+        [
+            "What is the capital of France?",
+            "What happend in 11 september 2001?",
+            "who is the president of the United States?"
+    ] ],
 )
 
 
 if __name__ == "__main__":
-    demo.launch(
+    demo.launch()
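For reference, the streaming path through respond reduces to a short pattern around InferenceClient.chat_completion from huggingface_hub. The sketch below is illustrative only: it assumes stream=True is passed in the part of the call the hunk does not show, that the token_HF secret is configured in the Space, and the example messages are placeholders.

    import os
    from huggingface_hub import InferenceClient

    client = InferenceClient("tiiuae/falcon-11B", token=os.environ.get("token_HF"))

    response = ""
    for chunk in client.chat_completion(
        [{"role": "user", "content": "Hello"}],  # in app.py this is the chat history plus run_rag(message)
        max_tokens=64,
        stream=True,        # assumed: yields incremental deltas instead of one final message
        temperature=0.7,
        top_p=0.95,
    ):
        delta = chunk.choices[0].delta.content
        response += str(delta)  # str() guards against a None delta on the final chunk, as in the diff
    print(response)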
example.py
ADDED
@@ -0,0 +1,102 @@
+import gradio as gr
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    BitsAndBytesConfig,
+)
+import os
+from threading import Thread
+import spaces
+import time
+
+token = os.environ["HF_TOKEN"]
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "NousResearch/Hermes-2-Pro-Llama-3-8B", quantization_config=quantization_config, token=token
+)
+tok = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", token=token)
+terminators = [
+    tok.eos_token_id,
+    tok.convert_tokens_to_ids("<|eot_id|>")
+]
+
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+else:
+    device = torch.device("cpu")
+    print("Using CPU")
+
+# model = model.to(device)
+# Dispatch Errors
+
+
+@spaces.GPU(duration=150)
+def chat(message, history, temperature,do_sample, max_tokens):
+    chat = []
+    for item in history:
+        chat.append({"role": "user", "content": item[0]})
+        if item[1] is not None:
+            chat.append({"role": "assistant", "content": item[1]})
+    chat.append({"role": "user", "content": message})
+    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    model_inputs = tok([messages], return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(
+        tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators,
+    )
+
+    if temperature == 0:
+        generate_kwargs['do_sample'] = False
+
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        yield partial_text
+
+    tokens = len(tok.tokenize(partial_text))
+    yield partial_text
+
+
+demo = gr.ChatInterface(
+    fn=chat,
+    examples=[["Write me a poem about Machine Learning."]],
+    # multimodal=False,
+    additional_inputs_accordion=gr.Accordion(
+        label="⚙️ Parameters", open=False, render=False
+    ),
+    additional_inputs=[
+        gr.Slider(
+            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+        ),
+        gr.Checkbox(label="Sampling",value=True),
+        gr.Slider(
+            minimum=128,
+            maximum=4096,
+            step=1,
+            value=512,
+            label="Max new tokens",
+            render=False,
+        ),
+    ],
+    stop_btn="Stop Generation",
+    title="Chat With LLMs",
+    description="Now Running [NousResearch/Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B) in 4bit"
+)
+demo.launch()
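The core of the chat function above is running model.generate on a worker thread while the Gradio callback consumes a TextIteratorStreamer. Stripped of the UI, the pattern looks roughly like the sketch below; model and tok are assumed to be the objects loaded in example.py, stream_generate is a hypothetical helper name, and the prompt string is a placeholder.

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_generate(model, tok, prompt, max_new_tokens=128):
        inputs = tok([prompt], return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
        # generate() blocks until the sequence finishes, so it runs on a worker
        # thread while this generator yields partial text back to the UI.
        generate_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
        Thread(target=model.generate, kwargs=generate_kwargs).start()
        text = ""
        for piece in streamer:
            text += piece
            yield text  # Gradio re-renders the growing answer on every yield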
rag.py
CHANGED
@@ -25,6 +25,5 @@ def run_rag(query, history=None):
     indices = [result.index for result in rerank_docs.results]
     documents = get_docs_by_indices(docs, indices)
     prompt = generate_prompt(documents, query, history)
-
-
-    return prompt
+
+    return query , prompt
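Because run_rag now returns a (query, prompt) pair rather than a bare prompt string, a caller has to unpack both values. A minimal sketch, with a placeholder question:

    from rag import run_rag

    query, prompt = run_rag("What is the capital of France?")
    # `prompt` is the string built by generate_prompt() from the reranked documents;
    # it is what ultimately gets sent to the chat model as the user message.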