import gradio as gr
import spaces
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

##############################################################################
# ZeroGPU constraints:
#   1) No GPU calls in top-level code
#   2) Decorate GPU-using functions with @spaces.GPU(...)
##############################################################################

TEXT_PIPELINE = None  # Holds the inference pipeline after finetuning (if any).

# Train on a small subset of WikiText-2 to keep the ZeroGPU demo short.
NUM_EXAMPLES = 1000


@spaces.GPU(duration=300)  # up to 5 minutes for the quick demo training run
def finetune_small_subset():
    """
    Demonstration:
      - Loads 'wuhp/myr1' (DeepSeek)
      - Finetunes on a small subset of WikiText-2
      - Disables fp16 to avoid the "Attempting to unscale FP16 gradients" error
      - Saves the model to 'finetuned_myr1'
      - Reloads it as a pipeline for inference
    """
    # 1) Load dataset
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    # 2) Load config, tokenizer, model
    config = AutoConfig.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        trust_remote_code=True
    )

    # If your GPU supports BF16 (e.g. an A100), you can set bf16=True and keep
    # fp16=False. Here both stay False so training runs in full float32, which
    # avoids the "Attempting to unscale FP16 gradients" error.
    bf16 = False
    fp16 = False

    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        # Load in float32 (or bfloat16 if enabled above); we avoid fp16 for training.
        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )

    # 3) Tokenize data
    def tokenize_fn(ex):
        return tokenizer(ex["text"], truncation=True, max_length=512)

    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    ds.set_format("torch")

    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # 4) TrainingArguments
    training_args = TrainingArguments(
        output_dir="finetuned_myr1",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=999999,  # effectively disables mid-training checkpoints
        save_total_limit=1,
        # Disable half precision explicitly.
        fp16=fp16,
        bf16=bf16,
        # If half-precision errors still appear, disable features that rely on
        # automatic gradient scaling, or manage mixed precision manually.
    )

    # 5) Build Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        data_collator=collator
    )

    # 6) Train
    trainer.train()

    # 7) Save the final model and tokenizer
    trainer.save_model("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

    # 8) Reload the newly trained model as a pipeline
    finetuned_model = AutoModelForCausalLM.from_pretrained(
        "finetuned_myr1",
        torch_dtype=torch.bfloat16 if bf16 else torch.float32,
        device_map="auto",
        trust_remote_code=True
    )
    global TEXT_PIPELINE
    TEXT_PIPELINE = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

    return "Finetuning complete! Model reloaded for inference."


def ensure_pipeline():
    """
    If we haven't finetuned yet (TEXT_PIPELINE is None), load the original
    model from 'wuhp/myr1' for inference.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
        tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
        # Use float32 for inference since neither bf16 nor fp16 is requested here.
        model = AutoModelForCausalLM.from_pretrained(
            "wuhp/myr1",
            subfolder="myr1",
            torch_dtype=torch.float32,
            device_map="auto",
            trust_remote_code=True
        )
        TEXT_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return TEXT_PIPELINE


@spaces.GPU(duration=120)  # up to 2 minutes for text generation
def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    """
    Generates text from the finetuned pipeline if available, otherwise from the
    original model. Lets the user adjust temperature, top_p, and the new-token
    range [260..5000].
    """
    pipe = ensure_pipeline()
    out = pipe(
        prompt,
        temperature=float(temperature),
        top_p=float(top_p),
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        do_sample=True
    )
    return out[0]["generated_text"]


# Build the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## ZeroGPU Mini-Finetuning (No FP16) + Long Text Generation")

    # 1) Button to run finetune_small_subset()
    finetune_btn = gr.Button("Finetune WikiText-2 (Subset)")
    status_box = gr.Textbox(label="Finetune Status")
    finetune_btn.click(fn=finetune_small_subset, outputs=status_box)

    gr.Markdown("Use 'Generate' to produce text from either the newly finetuned or the original model.")

    prompt_in = gr.Textbox(lines=3, label="Prompt")
    temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
    min_tokens = gr.Slider(260, 5000, value=260, step=10, label="Min New Tokens")
    max_tokens = gr.Slider(260, 5000, value=500, step=50, label="Max New Tokens")

    output_box = gr.Textbox(label="Generated Text", lines=12)
    gen_btn = gr.Button("Generate")

    gen_btn.click(
        fn=predict,
        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
        outputs=output_box
    )

demo.launch()