thesven committed on
Commit
964cd90
·
1 Parent(s): 8c6800e

basic setup

Browse files
Files changed (2) hide show
  1. app.py +36 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
5
+
6
# Checkpoint identifier for this Space.
# NOTE(review): currently unused — start() hard-codes the same path in a local
# variable; consider having start() read this constant instead. Verify no
# external code imports this name before removing it.
model_to_use = "thesven/Llama3-8B-SFT-code_bagel-bnb-4bit"
7
+
8
@spaces.GPU
def start(n):
    """Load the 4-bit quantized Llama3 checkpoint and generate a test response.

    Parameters
    ----------
    n : int | float | None
        Value from the Gradio ``gr.Number`` input. When it is a positive
        number it is used as the ``max_new_tokens`` generation budget;
        otherwise the previous hard-coded budget of 50 is kept, so existing
        callers see unchanged behavior. (The original ignored ``n`` entirely.)

    Returns
    -------
    str
        The decoded generation (prompt included, special tokens stripped)
        for a fixed smoke-test prompt.
    """
    model_name_or_path = "thesven/Llama3-8B-SFT-code_bagel-bnb-4bit"

    # Load the checkpoint in 4-bit NF4 precision so it fits on a single GPU.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="bfloat16",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=bnb_config,
    )

    # BUGFIX: the original wrote `model.pad_token = model.config.eos_token_id`,
    # assigning an attribute that does not exist on the model object. Padding
    # ids belong on the tokenizer and on `model.config.pad_token_id`. Llama3
    # ships without a pad token, so fall back to EOS.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Fixed smoke-test prompt (runtime string preserved from the original).
    input_text = "Hello, how are you?"
    # BUGFIX: `.cuda()` assumed CUDA unconditionally; `model.device` honors
    # whatever placement `device_map="auto"` chose.
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(model.device)

    # Use the numeric UI input as the generation budget when it is a positive
    # number; otherwise keep the original default of 50 new tokens.
    try:
        max_new = int(n) if n and int(n) > 0 else 50
    except (TypeError, ValueError):
        max_new = 50

    output = model.generate(inputs=input_ids, max_new_tokens=max_new)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text
34
+
35
# Wire the generation function to a minimal UI: one number in, text out.
demo = gr.Interface(fn=start, inputs=gr.Number(), outputs=gr.Text())

# Launch only when executed as a script (Hugging Face Spaces runs app.py
# directly); guarding prevents the server starting on a mere import.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ accelerate
2
+ bitsandbytes
3
+ transformers