import gradio as gr

from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
import torch

import subprocess
# Create a non-root "user" account (uid 1000); some hosted runtimes expect one.
subprocess.run(["useradd", "-m", "-u", "1000", "user"])


import torch._dynamo
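# Fall back to eager execution instead of raising if torch.compile cannot trace the model's custom code.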
torch._dynamo.config.suppress_errors = True

import os
# import pwd
# print("HERE will print PWD")
# print(pwd.getpwuid(os.getuid())[0])
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())


print("loading model")
# Load the tokenizer and model
repo_name = "nvidia/Hymba-1.5B-Instruct"
# repo_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)

model = model.cuda().to(torch.bfloat16)  # run on the GPU in bfloat16 (assumes a CUDA device is available)

print("model is loaded")


# Startup check: run a single chat turn with Hymba once the model is loaded
# prompt = input()
prompt = "Who are you?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]
messages.append({"role": "user", "content": prompt})

# Apply chat template
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
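# Stop generation as soon as the model emits the "</s>" end-of-sequence string.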
stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])

print("generating prompt")


outputs = model.generate(
    tokenized_chat,
    max_new_tokens=256,
    do_sample=False,  # greedy decoding; sampling parameters such as temperature would be ignored
    use_cache=True,
    stopping_criteria=stopping_criteria
)
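# Decode only the newly generated tokens, skipping the prompt portion of the output.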
input_length = tokenized_chat.shape[1]
response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

print(f"Model response: {response}")



def greet(prompt):
    # Generate a reply for the user's prompt with the same settings as the startup check above.
    chat = [{"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
    out = model.generate(inputs, max_new_tokens=256, do_sample=False, use_cache=True, stopping_criteria=stopping_criteria)
    reply = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
    print(f"User: {prompt}\nModel response: {reply}")
    return reply

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
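# launch() starts the Gradio web server (default port 7860) and keeps the script running.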
demo.launch()