molbal committed on
Commit 12fa70f · 1 Parent(s): 312ff65
Files changed (1)
  1. app.py +35 -50
app.py CHANGED
@@ -1,35 +1,22 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
+# Use the CRA-v1-7B model (which uses the GGUF file internally)
 client = InferenceClient("molbal/CRA-v1-7B")
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [
-        {"role": "system", "content": "You are a writer’s assistant."},
-        {"role": "system", "content": system_message},
-    ]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
+def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
+    # Build the conversation history; always include the system message
+    messages = [{"role": "system", "content": system_message}]
+    for user_msg, assistant_msg in history:
+        if user_msg:
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-
+
     response = ""
-
-    for message in client.chat_completion(
+    # Call the model with streaming and the new parameters
+    for chunk in client.chat_completion(
         messages,
         max_tokens=max_tokens,
         stream=True,
@@ -38,31 +25,29 @@ def respond(
         num_ctx=16384,
         repeat_penalty=1.05,
     ):
-        token = message.choices[0].delta.content
-
+        token = chunk.choices[0].delta.content
         response += token
         yield response
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="Understand how the story flows, what motivations the characters have and how they will interact with each other and the world as a step by step thought process before continuing the story.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.8,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-demo.launch()
-
-# Add an alert to mention that this runs on CPU
-gr.Markdown("**Note: This model runs on CPU, so it will be slow.**")
+# Create an alert message to inform users that inference runs on CPU (and will be slow)
+cpu_alert = gr.Markdown("**Note:** This model runs on CPU, so inference may be slow.")
+
+# Build the UI using Blocks to combine the alert and the ChatInterface
+with gr.Blocks() as demo:
+    cpu_alert.render()
+    chat_interface = gr.ChatInterface(
+        respond,
+        additional_inputs=[
+            gr.Textbox(
+                value="### System: You are a writer’s assistant. ### Task: Understand how the story flows, what motivations the characters have and how they will interact with each other and the world as a step by step thought process before continuing the story. ### Context: {context}",
+                label="System message"
+            ),
+            gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+            gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.05, label="Top-p (nucleus sampling)")
+        ]
+    )
+    chat_interface.render()
+
+if __name__ == "__main__":
+    demo.launch()
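
One caveat on the new streaming loop, with a minimal defensive sketch below. `num_ctx` and `repeat_penalty` look like Ollama-style generation options rather than documented parameters of `InferenceClient.chat_completion()`, and with OpenAI-style streaming a chunk's `delta.content` can be `None` (typically on the final chunk), which would make `response += token` raise a `TypeError`. The `stream_reply` helper below is an illustration under those assumptions, not part of this commit; it drops the two non-standard kwargs and skips empty deltas:

# Hedged sketch: a defensive variant of the streaming loop in respond().
# Assumptions (not from this commit): num_ctx/repeat_penalty are dropped
# because they are not documented chat_completion() parameters, and
# delta.content is treated as Optional[str].
from huggingface_hub import InferenceClient

client = InferenceClient("molbal/CRA-v1-7B")

def stream_reply(messages, max_tokens=512, temperature=0.7, top_p=0.8):
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # skip None/empty deltas instead of crashing on `+= None`
            response += token
            yield response

Called with the same message list that respond() builds, this yields the growing response string exactly as Gradio's streaming ChatInterface expects.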