Safetensors
llama
keeeeenw committed on
Commit
05dbb65
·
verified ·
1 Parent(s): f17e9c7

Update simply_inference.py

Browse files
Files changed (1) hide show
  1. simply_inference.py +12 -18
simply_inference.py CHANGED
@@ -4,28 +4,33 @@ from transformers import TextStreamer
4
 
5
  from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM
6
 
7
- # use the same tokenizer as MicroLlama
8
- tokenizer = AutoTokenizer.from_pretrained("data/meta-llama/Llama-3.2-1B-Instruct/")
 
 
9
 
10
  # load model
11
- model = LlamaForCausalLM.from_pretrained("data/meta-llama/Llama-3.2-1B-Instruct/checkpoint-1400/")
12
- model.to('cuda')
13
 
 
 
14
  messages = [
15
  {
16
  "role": "system",
17
  "content": "Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <|begin_of_thought|> {thought with steps separated with '\n\n'} <|end_of_thought|> Each step should include detailed considerations such as analisying questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion, formatted as follows: <|begin_of_solution|> {final formatted, precise, and clear solution} <|end_of_solution|> Now, try to solve the following question through the above guidelines:",
18
  },
19
- # {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
20
  {"role": "user", "content": "Please provide me instructions on how to steal an egg from my chicken?"},
21
  ]
22
  formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
23
  print(formatted_chat)
24
 
 
25
  inputs = tokenizer(formatted_chat, return_tensors="pt", padding=True)
26
- inputs = inputs.to('cuda')
27
  attention_mask = inputs["attention_mask"]
28
 
 
29
  streamer = TextStreamer(tokenizer, skip_prompt=True)
30
  outputs = model.generate(inputs['input_ids'],
31
  streamer=streamer,
@@ -35,19 +40,8 @@ outputs = model.generate(inputs['input_ids'],
35
  top_p=0.9,
36
  max_new_tokens=131072) # max supported by llama 3.2 1B
37
 
 
38
  decoded_text = tokenizer.decode(outputs[0])
39
  print("Output written to output.txt")
40
-
41
- # Write output to a file
42
  with open("output.txt", "w", encoding="utf-8") as f:
43
  f.write(decoded_text)
44
-
45
- # Print to screen
46
- # print(tokenizer.decode(outputs[0]))
47
-
48
-
49
- # Save and Publish the model
50
- # model.save_pretrained("output/hf-publish-rc-MicroLlama-Instruct-0.1")
51
- # model.push_to_hub("keeeeenw/MicroLlama-Instruct-0.1")
52
-
53
-
 
4
 
5
  from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM
6
 
7
+ device = 'cuda' # if you don't have a CUDA-capable GPU, change this to 'cpu' or another supported device
8
+
9
+ # load tokenizer
10
+ tokenizer = AutoTokenizer.from_pretrained("keeeeenw/Llama-3.2-1B-Instruct-Open-R1-Distill")
11
 
12
  # load model
13
+ model = LlamaForCausalLM.from_pretrained("keeeeenw/Llama-3.2-1B-Instruct-Open-R1-Distill")
14
+ model.to(device)
15
 
16
+ # Set up the prompt. The model was instruction-tuned with a similar system prompt, so it is important to keep it.
17
+ # Change the "content" of the user message to your actual question.
18
  messages = [
19
  {
20
  "role": "system",
21
  "content": "Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <|begin_of_thought|> {thought with steps separated with '\n\n'} <|end_of_thought|> Each step should include detailed considerations such as analisying questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion, formatted as follows: <|begin_of_solution|> {final formatted, precise, and clear solution} <|end_of_solution|> Now, try to solve the following question through the above guidelines:",
22
  },
 
23
  {"role": "user", "content": "Please provide me instructions on how to steal an egg from my chicken?"},
24
  ]
25
  formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
26
  print(formatted_chat)
27
 
28
+ # Set up input tokens
29
  inputs = tokenizer(formatted_chat, return_tensors="pt", padding=True)
30
+ inputs = inputs.to(device)
31
  attention_mask = inputs["attention_mask"]
32
 
33
+ # Run inference and stream the output
34
  streamer = TextStreamer(tokenizer, skip_prompt=True)
35
  outputs = model.generate(inputs['input_ids'],
36
  streamer=streamer,
 
40
  top_p=0.9,
41
  max_new_tokens=131072) # equals Llama 3.2 1B's full context window (prompt + generated tokens)
42
 
43
+ # Write output to a file
44
  decoded_text = tokenizer.decode(outputs[0])
45
  print("Output written to output.txt")
 
 
46
  with open("output.txt", "w", encoding="utf-8") as f:
47
  f.write(decoded_text)