wuhp committed on
Commit 09f030f · verified · 1 Parent(s): adb5084

Update app.py

Files changed (1)
  1. app.py +160 -62
app.py CHANGED
@@ -1,44 +1,74 @@
import gradio as gr
- import spaces
import torch
import faiss
- import numpy as np

from datasets import load_dataset
from transformers import (
    AutoConfig,
-     AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
-     BitsAndBytesConfig,
)

- from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, PeftModel

- from sentence_transformers import SentenceTransformer

- # Global variables for pipelines and settings.
- TEXT_PIPELINE = None
- COMPARISON_PIPELINE = None
- NUM_EXAMPLES = 50

@spaces.GPU(duration=300)
- def finetune_small_subset():
    """
    Fine-tunes the custom R1 model on a small subset of the ServiceNow-AI/R1-Distill-SFT dataset.
    Steps:
      1) Loads the model from "wuhp/myr1" (using files from the "myr1" subfolder via trust_remote_code).
      2) Applies 4-bit quantization and prepares for QLoRA training.
      3) Fine-tunes on the dataset (mapping "problem" to prompt and "solution" to target).
      4) Saves the LoRA adapter to "finetuned_myr1".
      5) Reloads the adapter for inference.
    """
    # Specify the configuration ("v0" or "v1") explicitly.
    ds = load_dataset("ServiceNow-AI/R1-Distill-SFT", "v0", split="train")
-     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
@@ -48,26 +78,8 @@ def finetune_small_subset():
    )

    # Load the custom model configuration from the repository.
-     base_config = AutoConfig.from_pretrained(
-         "wuhp/myr1",
-         subfolder="myr1",
-         trust_remote_code=True,
-     )
-     # (Optionally apply local overrides here if needed.)
-
-     tokenizer = AutoTokenizer.from_pretrained(
-         "wuhp/myr1",
-         subfolder="myr1",
-         trust_remote_code=True
-     )
-
-     base_model = AutoModelForCausalLM.from_pretrained(
-         "wuhp/myr1",
-         subfolder="myr1",
-         config=base_config,
-         quantization_config=bnb_config,
-         device_map="auto",
-         trust_remote_code=True
    )

    base_model = prepare_model_for_kbit_training(base_model)
@@ -100,8 +112,8 @@ def finetune_small_subset():
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=5,
-         save_steps=999999,
-         save_total_limit=1,
        fp16=False,
    )

@@ -116,13 +128,8 @@ def finetune_small_subset():
    trainer.model.save_pretrained("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

-     base_model_2 = AutoModelForCausalLM.from_pretrained(
-         "wuhp/myr1",
-         subfolder="myr1",
-         config=base_config,
-         quantization_config=bnb_config,
-         device_map="auto",
-         trust_remote_code=True
    )
    base_model_2 = prepare_model_for_kbit_training(base_model_2)

@@ -132,13 +139,17 @@ def finetune_small_subset():
    )

    global TEXT_PIPELINE
-     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)

    return "Finetuning complete. Model loaded for inference."

- def ensure_pipeline():
    """
    Loads the base model (without LoRA) if no fine-tuned model is available.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
@@ -148,22 +159,19 @@ def ensure_pipeline():
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
-         base_config = AutoConfig.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
-         tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
-         base_model = AutoModelForCausalLM.from_pretrained(
-             "wuhp/myr1",
-             subfolder="myr1",
-             config=base_config,
-             quantization_config=bnb_config,
-             device_map="auto",
-             trust_remote_code=True
        )
        TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
    return TEXT_PIPELINE

- def ensure_comparison_pipeline():
    """
    Loads the official R1 model pipeline if not already loaded.
    """
    global COMPARISON_PIPELINE
    if COMPARISON_PIPELINE is None:
@@ -177,10 +185,27 @@ def ensure_comparison_pipeline():
        COMPARISON_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return COMPARISON_PIPELINE

@spaces.GPU(duration=120)
- def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    """
    Direct generation without retrieval using the custom R1 model.
    """
    pipe = ensure_pipeline()
    out = pipe(
@@ -193,10 +218,27 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    )
    return out[0]["generated_text"]

@spaces.GPU(duration=120)
- def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    """
    Compare outputs between your custom R1 model and the official R1 model.
    """
    local_pipe = ensure_pipeline()
    comp_pipe = ensure_comparison_pipeline()
@@ -219,11 +261,22 @@ def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    )
    return local_out[0]["generated_text"], comp_out[0]["generated_text"]

class ConversationRetriever:
    """
    A FAISS-based retriever using SentenceTransformer for embedding.
    """
-     def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", embed_dim=384):
        self.embed_model = SentenceTransformer(model_name)
        self.embed_dim = embed_dim
        self.index = faiss.IndexFlatL2(embed_dim)
@@ -232,7 +285,13 @@ class ConversationRetriever:
        self.ids = []
        self.id_counter = 0

-     def add_text(self, text):
        if not text.strip():
            return
        emb = self.embed_model.encode([text], convert_to_numpy=True)
@@ -243,7 +302,17 @@ class ConversationRetriever:
        self.ids.append(self.id_counter)
        self.id_counter += 1

-     def search(self, query, top_k=3):
        q_emb = self.embed_model.encode([query], convert_to_numpy=True).astype(np.float32)
        q_vec = q_emb[0].reshape(1, -1)
        distances, indices = self.index.search(q_vec, top_k)
@@ -253,11 +322,20 @@ class ConversationRetriever:
            results.append((self.texts[idx], dist))
        return results

retriever = ConversationRetriever()

- def build_rag_prompt(user_query, retrieved_chunks):
    """
    Builds a prompt for retrieval-augmented generation.
    """
    context_str = ""
    for i, (chunk, dist) in enumerate(retrieved_chunks):
@@ -269,10 +347,29 @@ def build_rag_prompt(user_query, retrieved_chunks):
    )
    return prompt

@spaces.GPU(duration=120)
- def chat_rag(user_input, history, temperature, top_p, min_new_tokens, max_new_tokens):
    """
-     Chat with retrieval augmentation.
    """
    pipe = ensure_pipeline()
    retriever.add_text(f"User: {user_input}")
@@ -297,6 +394,7 @@ def chat_rag(user_input, history, temperature, top_p, min_new_tokens, max_new_tokens):
    history.append([user_input, assistant_reply])
    return history, history

# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# QLoRA Fine-tuning & RAG-based Chat Demo using Custom R1 Model")
@@ -351,4 +449,4 @@ with gr.Blocks() as demo:
        outputs=[chat_state, chatbot]
    )

- demo.launch()

@@ -1,44 +1,74 @@
import gradio as gr
+ import numpy as np
import torch
+
import faiss
+ import spaces

from datasets import load_dataset
+ from peft import LoraConfig, PeftModel, TaskType, get_peft_model, prepare_model_for_kbit_training
+ from sentence_transformers import SentenceTransformer
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

+ NUM_EXAMPLES_FOR_FINETUNING = 50  # Constant for the number of examples to use for finetuning
+ TEXT_PIPELINE = None  # Global to store the custom R1 text generation pipeline
+ COMPARISON_PIPELINE = None  # Global to store the official R1 text generation pipeline


+ def _load_model_and_tokenizer(model_name: str, subfolder: str = None, quantization_config: BitsAndBytesConfig = None, device_map: str = "auto", trust_remote_code: bool = True) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
+     """
+     Helper function to load a causal language model and its tokenizer.
+
+     Args:
+         model_name (str): The name or path of the pretrained model.
+         subfolder (str, optional): Subfolder within the model repository. Defaults to None.
+         quantization_config (BitsAndBytesConfig, optional): Configuration for quantization. Defaults to None.
+         device_map (str, optional): Device mapping for model loading. Defaults to "auto".
+         trust_remote_code (bool, optional): Trust remote code for custom models. Defaults to True.
+
+     Returns:
+         tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model and tokenizer.
+     """
+     config = AutoConfig.from_pretrained(model_name, subfolder=subfolder, trust_remote_code=trust_remote_code)
+     tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder=subfolder, trust_remote_code=trust_remote_code)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         subfolder=subfolder,
+         config=config,
+         quantization_config=quantization_config,
+         device_map=device_map,
+         trust_remote_code=trust_remote_code
+     )
+     return model, tokenizer
+

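For context, a minimal sketch of how this helper is meant to be called for QLoRA-style loading. The full BitsAndBytesConfig used by the app is collapsed in this diff, so the bnb_4bit_compute_dtype value below is an assumption, not taken from the hidden lines:

# Sketch only; bnb_4bit_compute_dtype is assumed, the other fields appear elsewhere in the diff.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption
)
model, tokenizer = _load_model_and_tokenizer(
    "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
)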
@spaces.GPU(duration=300)
+ def finetune_small_subset() -> str:
    """
    Fine-tunes the custom R1 model on a small subset of the ServiceNow-AI/R1-Distill-SFT dataset.
+
    Steps:
      1) Loads the model from "wuhp/myr1" (using files from the "myr1" subfolder via trust_remote_code).
      2) Applies 4-bit quantization and prepares for QLoRA training.
      3) Fine-tunes on the dataset (mapping "problem" to prompt and "solution" to target).
      4) Saves the LoRA adapter to "finetuned_myr1".
      5) Reloads the adapter for inference.
+
+     Returns:
+         str: A message indicating finetuning completion.
    """
    # Specify the configuration ("v0" or "v1") explicitly.
    ds = load_dataset("ServiceNow-AI/R1-Distill-SFT", "v0", split="train")
+     ds = ds.select(range(min(NUM_EXAMPLES_FOR_FINETUNING, len(ds))))

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
@@ -48,26 +78,8 @@ def finetune_small_subset():
    )

    # Load the custom model configuration from the repository.
+     base_model, tokenizer = _load_model_and_tokenizer(
+         "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
    )

    base_model = prepare_model_for_kbit_training(base_model)
@@ -100,8 +112,8 @@ def finetune_small_subset():
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        logging_steps=5,
+         save_steps=999999,  # Save infrequently to avoid filling up disk during demo
+         save_total_limit=1,  # Keep only the last saved checkpoint
        fp16=False,
    )

@@ -116,13 +128,8 @@ def finetune_small_subset():
    trainer.model.save_pretrained("finetuned_myr1")
    tokenizer.save_pretrained("finetuned_myr1")

+     base_model_2, tokenizer_2 = _load_model_and_tokenizer(  # Re-load base model for inference adapter application
+         "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
    )
    base_model_2 = prepare_model_for_kbit_training(base_model_2)

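The LoRA wiring around these hunks (the LoraConfig, Trainer setup, and the adapter reload) is collapsed in this diff. A hedged sketch of the usual peft pattern the surrounding code implies; the hyperparameter values are assumptions, not the repository's actual settings:

# Sketch, assuming the standard QLoRA/peft pattern; the real values live in the collapsed lines.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,               # assumed rank
    lora_alpha=32,      # assumed scaling factor
    lora_dropout=0.05,  # assumed dropout
)
lora_model = get_peft_model(base_model, lora_config)  # adapter wrapped around the base model for training
# After training and save_pretrained("finetuned_myr1"), the saved adapter is re-attached for inference:
lora_model_2 = PeftModel.from_pretrained(base_model_2, "finetuned_myr1")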
@@ -132,13 +139,17 @@ def finetune_small_subset():
    )

    global TEXT_PIPELINE
+     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer_2)  # Use tokenizer_2 here to be consistent

    return "Finetuning complete. Model loaded for inference."

+
+ def ensure_pipeline() -> pipeline:
    """
    Loads the base model (without LoRA) if no fine-tuned model is available.
+
+     Returns:
+         pipeline: The text generation pipeline using the custom R1 model.
    """
    global TEXT_PIPELINE
    if TEXT_PIPELINE is None:
@@ -148,22 +159,19 @@ def ensure_pipeline():
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
+         base_model, tokenizer = _load_model_and_tokenizer(
+             "wuhp/myr1", subfolder="myr1", quantization_config=bnb_config, device_map="auto"
        )
        TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
    return TEXT_PIPELINE

+
+ def ensure_comparison_pipeline() -> pipeline:
    """
    Loads the official R1 model pipeline if not already loaded.
+
+     Returns:
+         pipeline: The text generation pipeline using the official R1 model.
    """
    global COMPARISON_PIPELINE
    if COMPARISON_PIPELINE is None:
@@ -177,10 +185,27 @@ def ensure_comparison_pipeline():
        COMPARISON_PIPELINE = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return COMPARISON_PIPELINE

+
@spaces.GPU(duration=120)
+ def predict(
+     prompt: str,
+     temperature: float,
+     top_p: float,
+     min_new_tokens: int,
+     max_new_tokens: int
+ ) -> str:
    """
    Direct generation without retrieval using the custom R1 model.
+
+     Args:
+         prompt (str): The input prompt for text generation.
+         temperature (float): Sampling temperature.
+         top_p (float): Top-p sampling probability.
+         min_new_tokens (int): Minimum number of new tokens to generate.
+         max_new_tokens (int): Maximum number of new tokens to generate.
+
+     Returns:
+         str: The generated text output.
    """
    pipe = ensure_pipeline()
    out = pipe(
@@ -193,10 +218,27 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    )
    return out[0]["generated_text"]

+
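The keyword arguments passed to pipe() inside predict() are collapsed above. As a hedged illustration only, a transformers text-generation pipeline is typically driven with these sampling controls; the actual argument list in app.py may differ:

# Illustrative call; the real arguments used in predict() are in the collapsed lines.
pipe = ensure_pipeline()
out = pipe(
    "Explain QLoRA in one paragraph.",
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    min_new_tokens=10,
    max_new_tokens=200,
)
print(out[0]["generated_text"])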
@spaces.GPU(duration=120)
+ def compare_models(
+     prompt: str,
+     temperature: float,
+     top_p: float,
+     min_new_tokens: int,
+     max_new_tokens: int
+ ) -> tuple[str, str]:
    """
    Compare outputs between your custom R1 model and the official R1 model.
+
+     Args:
+         prompt (str): The input prompt for text generation.
+         temperature (float): Sampling temperature.
+         top_p (float): Top-p sampling probability.
+         min_new_tokens (int): Minimum number of new tokens to generate.
+         max_new_tokens (int): Maximum number of new tokens to generate.
+
+     Returns:
+         tuple[str, str]: A tuple containing the generated text from the custom R1 and official R1 models.
    """
    local_pipe = ensure_pipeline()
    comp_pipe = ensure_comparison_pipeline()
@@ -219,11 +261,22 @@ def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
    )
    return local_out[0]["generated_text"], comp_out[0]["generated_text"]

+
class ConversationRetriever:
    """
    A FAISS-based retriever using SentenceTransformer for embedding.
+
+     This class indexes text chunks using FAISS and SentenceTransformer embeddings
+     to enable efficient similarity search for retrieval-augmented generation.
    """
+     def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", embed_dim: int = 384):
+         """
+         Initializes the ConversationRetriever.
+
+         Args:
+             model_name (str, optional): Name of the SentenceTransformer model. Defaults to "sentence-transformers/all-MiniLM-L6-v2".
+             embed_dim (int, optional): Dimensionality of the embeddings. Defaults to 384.
+         """
        self.embed_model = SentenceTransformer(model_name)
        self.embed_dim = embed_dim
        self.index = faiss.IndexFlatL2(embed_dim)
@@ -232,7 +285,13 @@ class ConversationRetriever:
        self.ids = []
        self.id_counter = 0

+     def add_text(self, text: str):
+         """
+         Adds text to the retriever's index.
+
+         Args:
+             text (str): The text to add.
+         """
        if not text.strip():
            return
        emb = self.embed_model.encode([text], convert_to_numpy=True)
@@ -243,7 +302,17 @@ class ConversationRetriever:
        self.ids.append(self.id_counter)
        self.id_counter += 1

+     def search(self, query: str, top_k: int = 3) -> list[tuple[str, float]]:
+         """
+         Searches the retriever index for texts similar to the query.
+
+         Args:
+             query (str): The query text.
+             top_k (int, optional): Number of top results to retrieve. Defaults to 3.
+
+         Returns:
+             list[tuple[str, float]]: A list of tuples, where each tuple contains (text, distance).
+         """
        q_emb = self.embed_model.encode([query], convert_to_numpy=True).astype(np.float32)
        q_vec = q_emb[0].reshape(1, -1)
        distances, indices = self.index.search(q_vec, top_k)
@@ -253,11 +322,20 @@ class ConversationRetriever:
            results.append((self.texts[idx], dist))
        return results

+
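A small usage sketch of the retriever defined above (the method names are from the diff; the strings are hypothetical). IndexFlatL2 returns raw L2 distances, so smaller values mean closer matches:

# Usage sketch for the class above; example texts are hypothetical.
r = ConversationRetriever()
r.add_text("User: How do I fine-tune the model with QLoRA?")
r.add_text("Assistant: Load the base model in 4-bit and train a LoRA adapter on top.")
for text, dist in r.search("QLoRA fine-tuning", top_k=2):
    print(f"{dist:.3f}  {text}")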
retriever = ConversationRetriever()

+
+ def build_rag_prompt(user_query: str, retrieved_chunks: list[tuple[str, float]]) -> str:
    """
    Builds a prompt for retrieval-augmented generation.
+
+     Args:
+         user_query (str): The user's input query.
+         retrieved_chunks (list[tuple[str, float]]): List of retrieved text chunks and their distances.
+
+     Returns:
+         str: The formatted prompt string.
    """
    context_str = ""
    for i, (chunk, dist) in enumerate(retrieved_chunks):
@@ -269,10 +347,29 @@ def build_rag_prompt(user_query, retrieved_chunks):
    )
    return prompt

+
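How the retrieval pieces compose at call time. The prompt template body itself is collapsed above, so only the call pattern implied by the visible code is shown; the query string is hypothetical:

# Call pattern implied by the code above; the exact prompt template is in the collapsed lines.
query = "Which dataset is used for finetuning?"
chunks = retriever.search(query, top_k=3)   # list of (text, L2 distance) pairs
prompt = build_rag_prompt(query, chunks)
reply = ensure_pipeline()(prompt, max_new_tokens=200)[0]["generated_text"]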
@spaces.GPU(duration=120)
+ def chat_rag(
+     user_input: str,
+     history: list[list[str]],
+     temperature: float,
+     top_p: float,
+     min_new_tokens: int,
+     max_new_tokens: int
+ ) -> tuple[list[list[str]], list[list[str]]]:
    """
+     Chat with retrieval augmentation using the custom R1 model.
+
+     Args:
+         user_input (str): The user's chat input.
+         history (list[list[str]]): The chat history.
+         temperature (float): Sampling temperature.
+         top_p (float): Top-p sampling probability.
+         min_new_tokens (int): Minimum number of new tokens to generate.
+         max_new_tokens (int): Maximum number of new tokens to generate.
+
+     Returns:
+         tuple[list[list[str]], list[list[str]]]: Updated chat history and chatbot display history.
    """
    pipe = ensure_pipeline()
    retriever.add_text(f"User: {user_input}")
@@ -297,6 +394,7 @@ def chat_rag(user_input, history, temperature, top_p, min_new_tokens, max_new_tokens):
    history.append([user_input, assistant_reply])
    return history, history

+
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# QLoRA Fine-tuning & RAG-based Chat Demo using Custom R1 Model")
@@ -351,4 +449,4 @@ with gr.Blocks() as demo:
        outputs=[chat_state, chatbot]
    )

+ demo.launch()
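The Blocks layout between the Markdown header and demo.launch() is collapsed in this diff. A hedged sketch of the kind of hookup the visible outputs=[chat_state, chatbot] line implies; the widget names and slider ranges below are hypothetical, not the app's actual layout:

# Hypothetical widgets; only the outputs=[chat_state, chatbot] wiring is visible in the diff.
with gr.Blocks() as demo:
    gr.Markdown("# QLoRA Fine-tuning & RAG-based Chat Demo using Custom R1 Model")
    chatbot = gr.Chatbot()
    chat_state = gr.State([])
    user_box = gr.Textbox(label="Your message")
    temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
    top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top-p")
    min_tokens = gr.Slider(1, 128, value=10, step=1, label="Min new tokens")
    max_tokens = gr.Slider(16, 1024, value=200, step=1, label="Max new tokens")
    send_btn = gr.Button("Send")
    send_btn.click(
        chat_rag,
        inputs=[user_box, chat_state, temperature, top_p, min_tokens, max_tokens],
        outputs=[chat_state, chatbot],
    )
demo.launch()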