"""Gradio chat interface for the Nephra v1.0 GGUF model, served locally with llama-cpp-python."""

import os
import re

import gradio as gr
import huggingface_hub
import torch
from llama_cpp import Llama

# Constants
MODEL_REPO = "Marcus-Arcadius/nephra_v1.0-Q8_0-GGUF"
MODEL_FILENAME = "nephra_v1.0-q8_0.gguf"
MODEL_PATH = os.path.join(os.path.dirname(__file__), MODEL_FILENAME)


def clean_response(text):
    """Clean up a model response by removing artifacts and duplicates."""
    # Remove [INST], [/INST], and similar tags
    text = re.sub(r'\[/?INST\]', '', text)
    # Remove code blocks that contain 'greet()'
    text = re.sub(r'```python.*?greet\(\).*?```', '', text, flags=re.DOTALL)
    # Remove remaining code blocks
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove "A:" prefixes
    text = re.sub(r'A:', '', text)
    # Remove duplicate responses
    text = re.sub(r'(Hello!.*?about\?)\s*\1', r'\1', text, flags=re.DOTALL)
    # Remove <> and ~~ markers
    text = re.sub(r'[<>~]', '', text)
    # Clean up extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def format_prompt(message, system_prompt, context=""):
    """Format the prompt in the way the model expects, optionally including prior turns."""
    return f"<|system|>{system_prompt}{context}<|user|>{message}<|assistant|>"


def download_model():
    if not os.path.exists(MODEL_PATH):
        print(f"Downloading model from {MODEL_REPO}...")
        try:
            huggingface_hub.hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILENAME,
                local_dir=os.path.dirname(__file__),
                local_dir_use_symlinks=False
            )
            print("Model downloaded successfully!")
        except Exception as e:
            print(f"Error downloading model: {str(e)}")
            raise


def initialize_model():
    download_model()
    return Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        # Offload all layers to the GPU when CUDA is available; otherwise run on CPU
        n_gpu_layers=-1 if torch.cuda.is_available() else 0
    )


print("Starting model initialization...")
model = initialize_model()
print("Model initialized successfully!")


def generate_response(message, system_prompt, temperature, max_tokens, top_p,
                      presence_penalty, frequency_penalty, history):
    """Generate a response using the Nephra model."""
    # Format all previous conversation turns into context
    context = ""
    for human, assistant in history:
        context += f"<|user|>{human}<|assistant|>{assistant}"

    # Build the full prompt: system prompt, prior turns, then the current message
    full_prompt = format_prompt(message, system_prompt, context)

    # Generate response
    response = model.create_completion(
        prompt=full_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        # Stop when the model starts a new user turn; the EOS token ends generation automatically
        stop=["<|user|>"]
    )

    # Clean up the response
    assistant_message = clean_response(response["choices"][0]["text"])

    history.append((message, assistant_message))
    return "", history


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Nephra v1.0 Chatbot (Average Time: 35s)")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=600)
            msg = gr.Textbox(label="Message", placeholder="Type your message here...")
            clear = gr.Button("Clear")

        with gr.Column(scale=1):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant.",
                lines=3
            )
            with gr.Accordion("Model Parameters", open=False):
                temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
                max_tokens = gr.Slider(50, 4096, value=1024, step=1, label="Max Tokens")
                top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
                presence_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Presence Penalty")
                frequency_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Frequency Penalty")

    # Set up event handlers
    msg.submit(
        generate_response,
        [msg, system_prompt, temperature, max_tokens, top_p,
         presence_penalty, frequency_penalty, chatbot],
        [msg, chatbot]
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()