import gradio as gr
import numpy as np
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
from huggingface_hub import login, whoami
import os

# Authenticate against the Hugging Face Hub when a token is available
# (required if the model repo below is gated or private). whoami() is kept
# inside the guard: without a token it raises instead of printing user info.
if "HF_API_TOKEN" in os.environ:
    login(token=os.environ["HF_API_TOKEN"])
    info = whoami()
    print("Authenticated user info:", info)
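# Note: on Hugging Face Spaces, HF_API_TOKEN can be set as a Space secret so
# that it is exposed to the app as an environment variable at runtime.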
device = "cuda" if torch.cuda.is_available() else "cpu"
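# Assumption: on a ZeroGPU Space the `spaces` import patches torch so this
# reports "cuda" in the main process, even though a GPU is only actually
# attached inside @spaces.GPU-decorated calls.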
# Configuration
model_id = "raduqus/reco_1b_16bit"
MAX_SEED = np.iinfo(np.int32).max
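# On ZeroGPU Spaces, @spaces.GPU() requests a GPU for the duration of each
# decorated call; on regular hardware the decorator is effectively a no-op
# and inference runs on whatever `device` resolved to above.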
@spaces.GPU()
def infer(prompt, seed=0, randomize_seed=True, max_new_tokens=100, temperature=0.7, top_p=0.9):
    # Random seed handling: draw a fresh seed unless the user pinned one
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    torch.manual_seed(seed)
    # Load model and tokenizer on every call; caching them at module scope
    # would avoid the reload overhead at the cost of keeping weights resident.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    )
    # `.to(device)` must not be called on a model dispatched with
    # device_map="auto", so only move the model explicitly on CPU.
    if device == "cpu":
        model = model.to(device)
    # Construct an Alpaca-style prompt with the standard newline layout
    alpaca_prompt = (
        "### Instruction:\nBased on user interests and past activities, "
        "recommend tasks they might enjoy.\n\n"
        f"### Input:\n{prompt}\n\n### Response:\n"
    )
    # Tokenize the input and move it to the model's device
    inputs = tokenizer(alpaca_prompt, return_tensors="pt").to(model.device)
    # Generate only new tokens after the prompt; unpacking `inputs` also
    # passes the attention mask, and an explicit pad token id silences the
    # "no pad token" warning for models that lack one.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the generated continuation: slicing off the prompt tokens is
    # more robust than splitting the decoded text on "### Response:"
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
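# Illustrative direct call (hypothetical values; output depends on the model):
#   infer('{"user_interests": ["fitness"], "past_tasks": []}',
#         randomize_seed=False, seed=42, max_new_tokens=64)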
examples = [
    '{"user_interests": ["fitness", "food", "community"], "past_tasks": [{"title": "Led group runs", "description": "Organized weekly jogs."}, {"title": "Tried meal prep", "description": "Cooked for a full week."}, {"title": "Joined charity walks", "description": "Helped fundraise for causes."}]}',
]
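# The prompt is a JSON object with "user_interests" (a list of strings) and
# "past_tasks" (a list of {"title", "description"} objects), presumably
# matching the format used during fine-tuning.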
css = """
#col-container {
    margin: 0 auto;
    max-width: 520px;
}
"""
power_device = "GPU" if device == "cuda" else "CPU"
# Gradio app
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(f"""
# ZeroGPU Text-to-Text Recommendation
Currently running on {power_device}.
""")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt",
                show_label=False,
                lines=2,
            )
            run_button = gr.Button("Generate")
        result = gr.Textbox(label="Generated Recommendation", lines=4, interactive=False)
        with gr.Accordion("Advanced Settings", open=False):
            seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
            max_new_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=200, step=1, value=100)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7)
            top_p = gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9)
        gr.Examples(examples=examples, inputs=[prompt])
    run_button.click(
        fn=infer,
        inputs=[prompt, seed, randomize_seed, max_new_tokens, temperature, top_p],
        outputs=[result],
    )
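# queue() buffers concurrent requests so GPU calls run one at a time by default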
demo.queue().launch()