desert committed on
Commit
038ef00
·
1 Parent(s): d13f282

init inference

Browse files
Files changed (1) hide show
  1. app.py +26 -38
app.py CHANGED
@@ -1,66 +1,54 @@
1
- import os
2
- import subprocess
3
  import gradio as gr
 
 
4
  from huggingface_hub import hf_hub_download
5
 
6
  # Hugging Face repository IDs
7
  base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
8
  adapter_repo = "Mat17892/llama_lora_gguf"
9
 
10
- # Download the base model GGUF file
11
  print("Downloading base model...")
12
  base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")
13
 
14
- # Download the LoRA adapter GGUF file
15
  print("Downloading LoRA adapter...")
16
  lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
17
 
18
- # Define the llama-cli path explicitly
19
- llama_cli_path = "./llama.cpp/build/bin/llama-cli"
20
- if not os.access(llama_cli_path, os.X_OK): # Check if the file is executable
21
- os.chmod(llama_cli_path, 0o755) # Set executable permissions
22
 
23
- # Function to run `llama-cli` with base model and adapter
24
- def run_llama_cli(prompt):
25
- print("Running inference with llama-cli...")
26
- cmd = [
27
- llama_cli_path, # Path to the llama-cli executable
28
- "-c", "2048", # Context length
29
- "-cnv", # Enable conversational mode
30
- "-m", base_model_path,
31
- "--lora", lora_adapter_path,
32
- "--prompt", prompt,
33
- ]
34
- try:
35
- process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
36
- stdout, stderr = process.communicate()
37
 
38
- if process.returncode != 0:
39
- print("Error during inference:")
40
- print(stderr.decode())
41
- return "Error: Could not generate response."
42
 
43
- return stdout.decode().strip()
44
- except Exception as e:
45
- print(f"Exception occurred: {e}")
46
- return "Error: Could not generate response."
47
-
48
- # Gradio interface
49
- def chatbot_fn(user_input, chat_history):
50
- # Build the full chat history as the prompt
51
  prompt = ""
52
  for user, ai in chat_history:
53
  prompt += f"User: {user}\nAI: {ai}\n"
54
  prompt += f"User: {user_input}\nAI:" # Add latest user input
55
 
56
- # Generate response using llama-cli
57
- response = run_llama_cli(prompt)
 
 
 
 
58
 
59
  # Update chat history
60
  chat_history.append((user_input, response))
61
  return chat_history, chat_history
62
 
63
- # Build the Gradio UI
64
  with gr.Blocks() as demo:
65
  gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
66
  chatbot = gr.Chatbot(label="Chat with the Model")
@@ -75,7 +63,7 @@ with gr.Blocks() as demo:
75
 
76
  # Link components
77
  submit_btn.click(
78
- chatbot_fn,
79
  inputs=[user_input, chat_history],
80
  outputs=[chatbot, chat_history],
81
  show_progress=True,
 
 
 
1
  import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from peft import PeftModel, PeftConfig
4
  from huggingface_hub import hf_hub_download
5
 
6
  # Hugging Face repository IDs
7
  base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
8
  adapter_repo = "Mat17892/llama_lora_gguf"
9
 
10
# Download the base model weights and the LoRA adapter from the Hugging Face Hub.
# The GGUF filename is kept in one place because it is needed both for the
# download and for loading the checkpoint below.
base_model_filename = "Llama-3.2-3B-Instruct-Q8_0.gguf"

print("Downloading base model...")
base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_filename)

print("Downloading LoRA adapter...")
lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")

# Load the tokenizer and base model.
# A single .gguf file cannot be passed as the model path: transformers reads
# GGUF checkpoints through the `gguf_file` argument, given the repository that
# contains the file. The file downloaded above is reused from the local cache.
print("Loading base model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_repo, gguf_file=base_model_filename)
base_model = AutoModelForCausalLM.from_pretrained(base_model_repo, gguf_file=base_model_filename)

# Load the LoRA adapter.
# NOTE(review): `lora_adapter_path` points at a single .gguf file, while PEFT
# presumably expects an adapter directory or Hub repo id (adapter_config.json
# plus adapter weights) -- confirm, and switch to a PEFT-format adapter if so.
print("Loading LoRA adapter...")
config = PeftConfig.from_pretrained(lora_adapter_path)
model = PeftModel.from_pretrained(base_model, lora_adapter_path)

print("Model is ready!")
 
 
 
28
 
29
+ # Function for inference
30
def chat_with_model(user_input, chat_history):
    """
    Generate a response from the model using the chat history and user input.

    Args:
        user_input: The latest message typed by the user.
        chat_history: List of ``(user, ai)`` message pairs from earlier turns.
            The new turn is appended to this list in place.

    Returns:
        The updated chat history twice: once for the chatbot display and once
        for the state component, matching the two Gradio outputs.
    """
    # Prepare the prompt: replay every earlier turn, then add the new user
    # message followed by an open "AI:" cue for the model to complete.
    past_turns = "".join(f"User: {user}\nAI: {ai}\n" for user, ai in chat_history)
    prompt = past_turns + f"User: {user_input}\nAI:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)

    # For a causal LM, generate() returns the prompt tokens followed by the new
    # tokens. Decode only the newly generated part so the reply does not echo
    # the whole conversation back to the user.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Update chat history
    chat_history.append((user_input, response))
    return chat_history, chat_history
50
 
51
+ # Gradio UI
52
  with gr.Blocks() as demo:
53
  gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
54
  chatbot = gr.Chatbot(label="Chat with the Model")
 
63
 
64
  # Link components
65
  submit_btn.click(
66
+ chat_with_model,
67
  inputs=[user_input, chat_history],
68
  outputs=[chatbot, chat_history],
69
  show_progress=True,