import os
import random

import gradio as gr
import numpy as np
import spaces
import torch
from huggingface_hub import login, whoami
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate against the Hugging Face Hub if a token is available.
if "HF_API_TOKEN" in os.environ:
    login(token=os.environ["HF_API_TOKEN"])
    info = whoami()
    print("Authenticated as:", info.get("name"))

device = "cuda" if torch.cuda.is_available() else "cpu"

# Configuration
model_id = "raduqus/reco_1b_16bit"
MAX_SEED = np.iinfo(np.int32).max


@spaces.GPU()
def infer(prompt, seed=0, randomize_seed=True, max_new_tokens=100, temperature=0.7, top_p=0.9):
    # Random seed handling (torch.manual_seed also seeds CUDA devices)
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    torch.manual_seed(seed)

    # Load model and tokenizer inside the GPU-scoped call.
    # Note: .to(device) must not be combined with device_map="auto"
    # (accelerate-dispatched models cannot be moved), so only move
    # the model explicitly on CPU. float16 is also slow/unsupported
    # on many CPUs, so fall back to float32 there.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    )
    if device == "cpu":
        model = model.to(device)

    # Construct an Alpaca-style prompt.
    alpaca_prompt = (
        "### Instruction:\n"
        "Based on user interests and past activities, recommend tasks they might enjoy.\n\n"
        f"### Input:\n{prompt}\n\n"
        "### Response:\n"
    )

    # Tokenize the input (passes input_ids and attention_mask together,
    # which avoids the missing-attention-mask warning in generate()).
    inputs = tokenizer(alpaca_prompt, return_tensors="pt").to(model.device)

    # Generate only new tokens beyond the prompt.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # Decode and return only the generated response.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response_only = response.split("### Response:")[-1].strip()
    return response_only


examples = [
    '{"user_interests": ["fitness", "food", "community"], '
    '"past_tasks": [{"title": "Led group runs", "description": "Organized weekly jogs."}, '
    '{"title": "Tried meal prep", "description": "Cooked for a full week."}, '
    '{"title": "Joined charity walks", "description": "Helped fundraise for causes."}]}',
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 520px;
}
"""

power_device = "GPU" if device == "cuda" else "CPU"

# Gradio app
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(f"""
        # ZeroGPU Text-to-Text Recommendation
        Currently running on {power_device}.
        """)

        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt",
                show_label=False,
                lines=2,
            )
            run_button = gr.Button("Generate")

        result = gr.Textbox(label="Generated Recommendation", lines=4, interactive=False)

        with gr.Accordion("Advanced Settings", open=False):
            seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
            max_new_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=200, step=1, value=100)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7)
            top_p = gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9)

        gr.Examples(examples=examples, inputs=[prompt])

    run_button.click(
        fn=infer,
        inputs=[prompt, seed, randomize_seed, max_new_tokens, temperature, top_p],
        outputs=[result],
    )

demo.queue().launch()
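
# Optional: a minimal local smoke test (a sketch, not part of the app).
# It assumes network access to the Hub and, if the model repo is gated,
# a valid HF_API_TOKEN; the @spaces.GPU() decorator is a no-op outside a
# ZeroGPU Space. Comment out demo.queue().launch() above and uncomment
# below to run a single generation from the command line:
#
# if __name__ == "__main__":
#     print(infer(examples[0], seed=42, randomize_seed=False, max_new_tokens=80))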