Spaces:

IEIT-Yuan
/

Yuan2-2B-demo

Runtime error

App Files Files Community

IEIT-Yuan committed on Jan 15, 2024

Commit

b1e52c0

verified ·

1 Parent(s): 87f8f2a

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -29

app.py CHANGED Viewed

@@ -10,11 +10,11 @@ sys.path.append(
 from transformers import AutoModelForCausalLM,AutoTokenizer,LlamaTokenizer
 print("Creat tokenizer...")
-tokenizer = LlamaTokenizer.from_pretrained('IEITYuan/Yuan2-2B-hf', add_eos_token=False, add_bos_token=False, eos_token='<eod>')
 tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
 print("Creat model...")
-model = AutoModelForCausalLM.from_pretrained('IEITYuan/Yuan2-2B-hf', device_map='auto', torch_dtype=torch.bfloat16, trust_remote_code=True)
 # using CUDA for an optimal experience
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = model.to(device)
@@ -31,33 +31,36 @@ class StopOnTokens(StoppingCriteria):
 # Function to generate model predictions.
 def predict(message, history):
-    history_transformer_format = history + [[message, ""]]
-    stop = StopOnTokens()
-    # Formatting the input for the model.
-    messages = "</s>".join(["</s>".join(["\n<|user|>:" + item[0], "\n<|assistant|>:" + item[1]])
-                        for item in history_transformer_format])
-    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=1024,
-        do_sample=True,
-        top_p=0.95,
-        top_k=50,
-        temperature=0.7,
-        num_beams=1,
-        stopping_criteria=StoppingCriteriaList([stop])
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()  # Starting the generation in a separate thread.
-    partial_message = ""
-    for new_token in streamer:
-        partial_message += new_token
-        if '</s>' in partial_message:  # Breaking the loop if the stop token is generated.
-            break
-        yield partial_message
 # Setting up the Gradio chat interface.

 from transformers import AutoModelForCausalLM,AutoTokenizer,LlamaTokenizer
 print("Creat tokenizer...")
+tokenizer = LlamaTokenizer.from_pretrained('IEITYuan/Yuan2-2B-Janus-hf', add_eos_token=False, add_bos_token=False, eos_token='<eod>')
 tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
 print("Creat model...")
+model = AutoModelForCausalLM.from_pretrained('IEITYuan/Yuan2-2B-Janus-hf', device_map='auto', torch_dtype=torch.bfloat16, trust_remote_code=True)
 # using CUDA for an optimal experience
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = model.to(device)
 # Function to generate model predictions.
 def predict(message, history):
+    """Run deterministic generation (do_sample=False) on ``message`` and return the decoded text.
+
+    ``history`` is accepted but not used by the active code path; only the
+    latest ``message`` is tokenized and sent to the model. The earlier
+    streaming implementation is kept below, commented out.
+    """
+    # history_transformer_format = history + [[message, ""]]
+    # stop = StopOnTokens()
+    #
+    # # Formatting the input for the model.
+    # messages = "</s>".join(["</s>".join(["\n<|user|>:" + item[0], "\n<|assistant|>:" + item[1]])
+    #                     for item in history_transformer_format])
+    # model_inputs = tokenizer([messages], return_tensors="pt").to(device)
+    # streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+    # generate_kwargs = dict(
+    #     model_inputs,
+    #     streamer=streamer,
+    #     max_new_tokens=1024,
+    #     do_sample=True,
+    #     top_p=0.95,
+    #     top_k=50,
+    #     temperature=0.7,
+    #     num_beams=1,
+    #     stopping_criteria=StoppingCriteriaList([stop])
+    # )
+    # t = Thread(target=model.generate, kwargs=generate_kwargs)
+    # t.start()  # Starting the generation in a separate thread.
+    # partial_message = ""
+    # for new_token in streamer:
+    #     partial_message += new_token
+    #     if '</s>' in partial_message:  # Breaking the loop if the stop token is generated.
+    #         break
+    #     yield partial_message
+    # Move inputs to the module-level ``device`` (CUDA when available, else CPU),
+    # the same device the model was moved to, instead of a hard-coded "cuda:0"
+    # that fails on hosts without a GPU.
+    inputs = tokenizer(message, return_tensors="pt")["input_ids"].to(device)
+    # NOTE(review): max_length=100 presumably bounds prompt + generated tokens
+    # together, so long prompts leave little room for output - confirm.
+    outputs = model.generate(inputs, do_sample=False, max_length=100)
+    return tokenizer.decode(outputs[0])
 # Setting up the Gradio chat interface.