pmolchanov committed (verified)
Commit 76fda6c · 1 Parent(s): 7c87075

Update app.py

Files changed (1)
  1. app.py +38 -1
app.py CHANGED
@@ -1,7 +1,44 @@
 import gradio as gr
 
+from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
+import torch
+
+# Load the tokenizer and model
+repo_name = "nvidia/Hymba-1.5B-Instruct"
+
+tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
+model = model.cuda().to(torch.bfloat16)
+
+# Chat with Hymba
+# prompt = input()
+prompt = "Who are you?"
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."}
+]
+messages.append({"role": "user", "content": prompt})
+
+# Apply chat template
+tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
+stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])
+outputs = model.generate(
+    tokenized_chat,
+    max_new_tokens=256,
+    do_sample=False,
+    temperature=0.7,
+    use_cache=True,
+    stopping_criteria=stopping_criteria
+)
+input_length = tokenized_chat.shape[1]
+response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+
+
+
 def greet(name):
-    return "Hello " + name + "!!"
+    print(f"User: {prompt}")
+    print(f"Model response: {response}")
+    # return "Hello " + name + "!!"
 
 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()
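
Note: as committed, greet() ignores its name argument and returns nothing, so the Gradio output box stays empty; the response is generated once at import time for the hard-coded prompt and is only printed to the Space logs. Below is a minimal sketch of how the same calls could run inside the request handler instead, so the app answers whatever the user types. The chat() handler name is hypothetical; the model, tokenizer, and generation arguments reuse what this commit already sets up, and a CUDA device is assumed just as in the committed code.

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList

# Load the tokenizer and model once at startup, as in the commit
repo_name = "nvidia/Hymba-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)

stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])

def chat(prompt):
    # Hypothetical handler: build the chat for the user's prompt on every request
    # instead of generating once at import time for a fixed question.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )
    # Return only the newly generated tokens so the prompt is not echoed back.
    input_length = tokenized_chat.shape[1]
    return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

demo = gr.Interface(fn=chat, inputs="text", outputs="text")
demo.launch()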