Spaces:

ysharma
/

Chat_with_Meta_llama3_1_8b

Running on Zero

App | Files | Community

Update app.py

by hysts HF staff - opened Jul 23

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+12

-13

Files changed (1) hide show

app.py +12 -13

app.py CHANGED Viewed

@@ -1,12 +1,9 @@
 import gradio as gr
 import os
 import spaces
-from transformers import GemmaTokenizer, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
-# Set an environment variable
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
 TITLE = '''
 <h1 style="text-align: center;">Meta Llama3.1 8B <a href="https://huggingface.co/spaces/ysharma/Chat_with_Meta_llama3_1_8b?duplicate=true" id="duplicate-button"><button style="color:white">Duplicate this Space</button></a></h1>
@@ -47,16 +44,18 @@ h1 {
 }
 """
-model = "llhf/Meta-Llama-3.1-8B-Instruct"
 # Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(f"{model}")
-model = AutoModelForCausalLM.from_pretrained(f"{model}", device_map="auto")
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
 ]
 # Gradio inference function
 @spaces.GPU(duration=120)
 def chat_llama3_1_8b(message: str,
@@ -79,7 +78,11 @@ def chat_llama3_1_8b(message: str,
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
@@ -87,14 +90,11 @@ def chat_llama3_1_8b(message: str,
         input_ids= input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-        do_sample=True,
         temperature=temperature,
         eos_token_id=terminators,
     )
-    # Enforce greedy generation (do_sample=False) when the temperature passed in is 0, avoiding the crash.
-    if temperature == 0:
-        generate_kwargs['do_sample'] = False
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
@@ -148,4 +148,3 @@ with gr.Blocks(fill_height=True, css=css) as demo:
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import os
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 TITLE = '''
 <h1 style="text-align: center;">Meta Llama3.1 8B <a href="https://huggingface.co/spaces/ysharma/Chat_with_Meta_llama3_1_8b?duplicate=true" id="duplicate-button"><button style="color:white">Duplicate this Space</button></a></h1>
 }
 """
+model_id = "llhf/Meta-Llama-3.1-8B-Instruct"
 # Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
 ]
+MAX_INPUT_TOKEN_LENGTH = 4096
 # Gradio inference function
 @spaces.GPU(duration=120)
 def chat_llama3_1_8b(message: str,
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
         input_ids= input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
+        do_sample=temperature != 0,  # Enforce greedy generation (do_sample=False) when the temperature passed in is 0, avoiding the crash.
         temperature=temperature,
         eos_token_id=terminators,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 if __name__ == "__main__":
     demo.launch()