wuhp committed (verified)
Commit 3986b4b · 1 Parent(s): 09f030f

Update app.py

Files changed (1): app.py (+90 −46)
app.py CHANGED
@@ -21,7 +21,7 @@ from transformers import (
 
 NUM_EXAMPLES_FOR_FINETUNING = 50 # Constant for the number of examples to use for finetuning
 TEXT_PIPELINE = None # Global to store the custom R1 text generation pipeline
-COMPARISON_PIPELINE = None # Global to store the official R1 text generation pipeline
+COMPARISON_PIPELINE = None # Global to store the official R1 text generation pipeline
 
 
 def _load_model_and_tokenizer(model_name: str, subfolder: str = None, quantization_config: BitsAndBytesConfig = None, device_map: str = "auto", trust_remote_code: bool = True) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
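Note: the COMPARISON_PIPELINE global kept here feeds compare_models() further down via ensure_comparison_pipeline(), whose body is not part of this diff. A minimal sketch of such a lazy loader, assuming the official checkpoint is deepseek-ai/DeepSeek-R1-Distill-Llama-8B (the model named in the UI text added later in this commit):

```python
# Hypothetical sketch only: ensure_comparison_pipeline() exists in app.py but its
# body is not shown in this diff, and the repo id below is an assumption.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

COMPARISON_PIPELINE = None  # Global cache for the official R1 pipeline


def ensure_comparison_pipeline():
    """Load the official R1 text-generation pipeline once and reuse it."""
    global COMPARISON_PIPELINE
    if COMPARISON_PIPELINE is None:
        model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # assumed repo id
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
        COMPARISON_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return COMPARISON_PIPELINE
```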
@@ -66,7 +66,6 @@ def finetune_small_subset() -> str:
     Returns:
         str: A message indicating finetuning completion.
     """
-    # Specify the configuration ("v0" or "v1") explicitly.
     ds = load_dataset("ServiceNow-AI/R1-Distill-SFT", "v0", split="train")
     ds = ds.select(range(min(NUM_EXAMPLES_FOR_FINETUNING, len(ds))))
 
@@ -76,8 +75,6 @@ def finetune_small_subset() -> str:
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
-
-    # Load the custom model configuration from the repository.
     base_model, tokenizer = _load_model_and_tokenizer(
         "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
     )
@@ -112,8 +109,8 @@ def finetune_small_subset() -> str:
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
-        save_steps=999999, # Save infrequently to avoid filling up disk during demo
-        save_total_limit=1, # Keep only the last saved checkpoint
+        save_steps=999999,
+        save_total_limit=1,
         fp16=False,
     )
 
@@ -128,7 +125,7 @@ def finetune_small_subset() -> str:
     trainer.model.save_pretrained("finetuned_myr1")
     tokenizer.save_pretrained("finetuned_myr1")
 
-    base_model_2, tokenizer_2 = _load_model_and_tokenizer( # Re-load base model for inference adapter application
+    base_model_2, tokenizer_2 = _load_model_and_tokenizer(
         "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
     )
     base_model_2 = prepare_model_for_kbit_training(base_model_2)
@@ -139,7 +136,7 @@ def finetune_small_subset() -> str:
     )
 
     global TEXT_PIPELINE
-    TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer_2) # Use tokenizer_2 here to be consistent
+    TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer_2)
 
     return "Finetuning complete. Model loaded for inference."
 
@@ -205,18 +202,26 @@ def predict(
         max_new_tokens (int): Maximum number of new tokens to generate.
 
     Returns:
-        str: The generated text output.
+        str: The generated text output with "Thinking Process" and "Solution" sections.
     """
     pipe = ensure_pipeline()
-    out = pipe(
+    thinking_prefix = "**Thinking Process:**\n"
+    solution_prefix = "\n**Solution:**\n"
+    formatted_output = thinking_prefix
+
+    output = pipe(
         prompt,
         temperature=float(temperature),
         top_p=float(top_p),
         min_new_tokens=int(min_new_tokens),
         max_new_tokens=int(max_new_tokens),
         do_sample=True
-    )
-    return out[0]["generated_text"]
+    )[0]["generated_text"]
+
+    formatted_output += output.strip() + solution_prefix
+    formatted_output += "Final Answer (This part is a placeholder and needs better extraction): ... "
+
+    return formatted_output
 
 
 @spaces.GPU(duration=120)
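The rewritten predict() wraps the raw generation in "Thinking Process" / "Solution" headings but, as its own placeholder string says, does not yet extract a final answer. One possible follow-up, sketched under the assumption that the model emits R1-style <think>...</think> tags (this diff does not confirm that):

```python
# Illustrative helper, not part of the commit. Assumes the model wraps its
# reasoning in <think>...</think>, as the official R1 distills do.
def split_thinking_and_solution(raw_output: str) -> tuple[str, str]:
    marker = "</think>"
    if marker in raw_output:
        thinking, solution = raw_output.split(marker, 1)
        return thinking.replace("<think>", "").strip(), solution.strip()
    # No marker found: treat the whole generation as the solution.
    return "", raw_output.strip()
```

predict() could then fill the two sections from this split instead of appending the placeholder "Final Answer" string.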
@@ -238,28 +243,41 @@ def compare_models(
         max_new_tokens (int): Maximum number of new tokens to generate.
 
     Returns:
-        tuple[str, str]: A tuple containing the generated text from the custom R1 and official R1 models.
+        tuple[str, str]: A tuple containing the formatted generated text from the custom R1 and official R1 models, each with "Thinking Process" and "Solution" sections.
     """
     local_pipe = ensure_pipeline()
     comp_pipe = ensure_comparison_pipeline()
 
-    local_out = local_pipe(
+    def format_comparison_output(model_name, raw_output):
+        thinking_prefix = f"**{model_name} - Thinking Process:**\n"
+        solution_prefix = f"\n**{model_name} - Solution:**\n"
+        formatted_output = thinking_prefix
+        formatted_output += raw_output.strip() + solution_prefix
+        formatted_output += f"{model_name} Final Answer: ... "
+        return formatted_output
+
+    local_out_raw = local_pipe(
         prompt,
         temperature=float(temperature),
         top_p=float(top_p),
         min_new_tokens=int(min_new_tokens),
         max_new_tokens=int(max_new_tokens),
         do_sample=True
-    )
-    comp_out = comp_pipe(
+    )[0]["generated_text"]
+
+    comp_out_raw = comp_pipe(
         prompt,
         temperature=float(temperature),
         top_p=float(top_p),
         min_new_tokens=int(min_new_tokens),
         max_new_tokens=int(max_new_tokens),
         do_sample=True
-    )
-    return local_out[0]["generated_text"], comp_out[0]["generated_text"]
+    )[0]["generated_text"]
+
+    local_out_formatted = format_comparison_output("Custom R1", local_out_raw)
+    comp_out_formatted = format_comparison_output("Official R1", comp_out_raw)
+
+    return local_out_formatted, comp_out_formatted
 
 
 class ConversationRetriever:
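Both pipelines in compare_models() are now called with identical generation settings; a possible later cleanup (not part of this commit) is to factor the duplicated keyword blocks into a small helper:

```python
# Refactor sketch only; mirrors the exact call pattern used in compare_models().
def _generate(pipe, prompt: str, temperature: float, top_p: float,
              min_new_tokens: int, max_new_tokens: int) -> str:
    return pipe(
        prompt,
        temperature=float(temperature),
        top_p=float(top_p),
        min_new_tokens=int(min_new_tokens),
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
    )[0]["generated_text"]
```

compare_models() would then reduce to two _generate() calls followed by the two format_comparison_output() calls.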
@@ -335,15 +353,20 @@ def build_rag_prompt(user_query: str, retrieved_chunks: list[tuple[str, float]])
         retrieved_chunks (list[tuple[str, float]]): List of retrieved text chunks and their distances.
 
     Returns:
-        str: The formatted prompt string.
+        str: The formatted prompt string including instructions for step-by-step thinking and using context.
     """
     context_str = ""
-    for i, (chunk, dist) in enumerate(retrieved_chunks):
-        context_str += f"Chunk #{i+1} (similarity ~ {dist:.2f}):\n{chunk}\n\n"
+    if retrieved_chunks:
+        context_str += "**Relevant Context:**\n"
+        for i, (chunk, dist) in enumerate(retrieved_chunks):
+            context_str += f"Chunk #{i+1} (similarity ~ {dist:.2f}):\n> {chunk}\n\n"
+
+    prompt_instruction = "Please provide a detailed answer, showing your thinking process step-by-step before stating the final answer. Use the provided context if relevant."
     prompt = (
-        f"User's Query:\n{user_query}\n\n"
-        f"Relevant Context:\n{context_str}"
-        "Assistant:"
+        f"**User Query:**\n{user_query}\n\n"
+        f"{context_str}\n"
+        f"{prompt_instruction}\n\n"
+        "**Answer:**\n"
    )
     return prompt
 
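For reference, with the new build_rag_prompt() a query with one retrieved chunk produces a prompt laid out roughly as follows (query and chunk text here are made up; the function itself comes from the hunk above):

```python
# Usage illustration, assuming build_rag_prompt() from app.py is in scope.
chunks = [("QLoRA fine-tunes a 4-bit quantized base model with LoRA adapters.", 0.12)]
print(build_rag_prompt("What is QLoRA?", chunks))
# **User Query:**
# What is QLoRA?
#
# **Relevant Context:**
# Chunk #1 (similarity ~ 0.12):
# > QLoRA fine-tunes a 4-bit quantized base model with LoRA adapters.
#
#
# Please provide a detailed answer, showing your thinking process step-by-step before stating the final answer. Use the provided context if relevant.
#
# **Answer:**
```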
 
@@ -369,13 +392,18 @@ def chat_rag(
         max_new_tokens (int): Maximum number of new tokens to generate.
 
     Returns:
-        tuple[list[list[str]], list[list[str]]]: Updated chat history and chatbot display history.
+        tuple[list[list[str]], list[list[str]]]: Updated chat history and chatbot display history, with formatted assistant replies.
     """
     pipe = ensure_pipeline()
     retriever.add_text(f"User: {user_input}")
     top_k = 3
     results = retriever.search(user_input, top_k=top_k)
     prompt = build_rag_prompt(user_input, results)
+
+    thinking_prefix = "**Thinking Process:**\n"
+    solution_prefix = "\n**Solution:**\n"
+    formatted_output = thinking_prefix
+
     output = pipe(
         prompt,
         temperature=float(temperature),
@@ -385,10 +413,14 @@ def chat_rag(
         do_sample=True
     )[0]["generated_text"]
 
-    if output.startswith(prompt):
-        assistant_reply = output[len(prompt):].strip()
+    formatted_output += output.strip() + solution_prefix
+    formatted_output += "Final Answer (This part is a placeholder and needs better extraction): ... "
+    assistant_reply = formatted_output
+
+    if assistant_reply.startswith(prompt):
+        assistant_reply = assistant_reply[len(prompt):].strip()
     else:
-        assistant_reply = output.strip()
+        assistant_reply = assistant_reply.strip()
 
     retriever.add_text(f"Assistant: {assistant_reply}")
     history.append([user_input, assistant_reply])
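Note that the reply passed to the startswith() check above is the wrapped string, which always begins with "**Thinking Process:**", so a prompt echo in the raw generation is no longer stripped. A small helper that could restore the original intent (a sketch, not part of this commit):

```python
# Hypothetical helper: remove a prompt echo from the raw pipeline output before
# it is wrapped in the Thinking/Solution headings inside chat_rag().
def strip_prompt_echo(raw_output: str, prompt: str) -> str:
    if raw_output.startswith(prompt):
        return raw_output[len(prompt):].strip()
    return raw_output.strip()
```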
@@ -398,46 +430,56 @@ def chat_rag(
 # Build the Gradio interface.
 with gr.Blocks() as demo:
     gr.Markdown("# QLoRA Fine-tuning & RAG-based Chat Demo using Custom R1 Model")
+    gr.Markdown("---")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on ServiceNow-AI/R1-Distill-SFT subset (up to 5 min)")
-    status_box = gr.Textbox(label="Finetune Status")
+    gr.Markdown("## ⚙️ Fine-tuning (Optional)")
+    gr.Markdown("This section allows you to fine-tune the custom R1 model on a small subset of the ServiceNow dataset. This step is optional but can potentially improve the model's performance on ServiceNow-related tasks. **Note:** This process may take up to 5 minutes.")
+    finetune_btn = gr.Button("🚀 Start Fine-tuning (QLoRA)")
+    status_box = gr.Textbox(label="Fine-tuning Status", interactive=False)
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
+    gr.Markdown("---")
 
-    gr.Markdown("## Direct Generation (No Retrieval) using Custom R1")
-    prompt_in = gr.Textbox(lines=3, label="Prompt")
-    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
-    top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p")
+    gr.Markdown("## ✍️ Direct Generation (No Retrieval)")
+    gr.Markdown("Enter a prompt below to generate text directly using the custom R1 model. This is standard text generation without retrieval augmentation.")
+    prompt_in = gr.Textbox(lines=3, label="Input Prompt", placeholder="Enter your prompt here...")
+    temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature (Creativity)")
+    top_p = gr.Slider(0.0, 1.0, step=0.05, value=0.9, label="Top-p (Sampling Nucleus)")
     min_tokens = gr.Slider(1, 2500, value=50, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(1, 2500, value=200, step=50, label="Max New Tokens")
-    output_box = gr.Textbox(label="Custom R1 Output", lines=8)
-    gen_btn = gr.Button("Generate with Custom R1")
+    output_box = gr.Textbox(label="Custom R1 Output", lines=8, interactive=False)
+    gen_btn = gr.Button("Generate Text")
     gen_btn.click(
         fn=predict,
         inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
         outputs=output_box
     )
-
-    gr.Markdown("## Compare Custom R1 vs Official R1")
-    compare_btn = gr.Button("Compare")
-    out_custom = gr.Textbox(label="Custom R1 Output", lines=6)
-    out_official = gr.Textbox(label="Official R1 Output", lines=6)
+    gr.Markdown("---")
+
+    gr.Markdown("## 🆚 Compare Custom R1 vs Official R1")
+    gr.Markdown("Enter a prompt to compare the text generation of your fine-tuned custom R1 model with the official DeepSeek-R1-Distill-Llama-8B model.")
+    compare_prompt_in = gr.Textbox(lines=3, label="Comparison Prompt", placeholder="Enter prompt for comparison...")
+    compare_btn = gr.Button("⚖️ Compare Models")
+    out_custom = gr.Textbox(label="Custom R1 Output", lines=6, interactive=False)
+    out_official = gr.Textbox(label="Official R1 Output", lines=6, interactive=False)
     compare_btn.click(
         fn=compare_models,
-        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        inputs=[compare_prompt_in, temperature, top_p, min_tokens, max_tokens],
         outputs=[out_custom, out_official]
     )
+    gr.Markdown("---")
 
-    gr.Markdown("## Chat with Retrieval-Augmented Memory")
+    gr.Markdown("## 💬 Chat with Retrieval-Augmented Memory (RAG)")
+    gr.Markdown("Chat with the custom R1 model, enhanced with a retrieval-augmented memory. The model will retrieve relevant information based on your queries to provide more informed responses.")
     with gr.Row():
         with gr.Column():
-            chatbot = gr.Chatbot(label="RAG Chat")
+            chatbot = gr.Chatbot(label="RAG Chatbot")
             chat_state = gr.State([])
             user_input = gr.Textbox(
                 show_label=False,
-                placeholder="Ask a question...",
+                placeholder="Ask a question to the RAG Chatbot...",
                 lines=2
             )
-            send_btn = gr.Button("Send")
+            send_btn = gr.Button("➡️ Send")
     user_input.submit(
         fn=chat_rag,
         inputs=[user_input, chat_state, temperature, top_p, min_tokens, max_tokens],
@@ -448,5 +490,7 @@ with gr.Blocks() as demo:
         inputs=[user_input, chat_state, temperature, top_p, min_tokens, max_tokens],
         outputs=[chat_state, chatbot]
     )
+    gr.Markdown("---")
+
 
 demo.launch()
 