import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
import torch
import subprocess
# Create a non-root user with UID 1000 (Hugging Face Spaces convention)
subprocess.run(["useradd", "-m", "-u", "1000", "user"])
# Suppress torch.compile/dynamo errors and fall back to eager execution
import torch._dynamo
torch._dynamo.config.suppress_errors = True
# Optional startup diagnostics, left disabled:
# import os
# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())
print("loading model")
# Load the tokenizer and model
repo_name = "nvidia/Hymba-1.5B-Instruct"
# repo_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)
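# A possible alternative (a sketch, not from the original app): passing
# torch_dtype= to from_pretrained loads the weights in bfloat16 directly,
# avoiding the intermediate fp32 copy that the cast above goes through:
# model = AutoModelForCausalLM.from_pretrained(
#     repo_name, trust_remote_code=True, torch_dtype=torch.bfloat16
# ).cuda()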
print("model is loaded")
# Chat with Hymba. The templating + generation logic lives in one helper so
# that the startup smoke test and the Gradio callback share the same path.
stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])

def generate_response(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Apply the chat template and move the input ids to the GPU
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    print("generating response")
    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,  # greedy decoding; temperature only takes effect with do_sample=True
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )
    # Decode only the newly generated tokens, not the echoed prompt
    input_length = tokenized_chat.shape[1]
    return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

# Smoke test at startup
response = generate_response("Who are you?")
print(f"Model response: {response}")
def greet(prompt):
    print(f"User: {prompt}")
    response = generate_response(prompt)
    print(f"Model response: {response}")
    return response

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
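# Optional multi-turn variant (a sketch, not part of the original app):
# gr.ChatInterface gives a chat UI where the callback receives the latest
# message plus the running history; this minimal version ignores the
# history and answers only the newest message via generate_response.
# def chat_fn(message, history):
#     return generate_response(message)
#
# demo = gr.ChatInterface(fn=chat_fn)
# demo.launch()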