import subprocess

import gradio as gr
from llama_cpp import Llama

# Load the fine-tuned, GGUF-quantized model with llama-cpp-python.
model = Llama(model_path="QA_llama31_unsloth.Q4_K_M.gguf")


def generate_response(prompt):
    """Generate an answer using the llama-cpp-python chat completion API."""
    response = model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}]
    )
    return response["choices"][0]["message"]["content"]


# Alternative inference path: shell out to the llama.cpp CLI binary instead
# of using the Python bindings. Kept for reference; not wired into the UI.
def predict(text):
    result = subprocess.run(
        ["./llama.cpp/main", "-m", "QA_llama31_unsloth.Q4_K_M.gguf", "-p", text],
        capture_output=True,
        text=True,
    )
    return result.stdout


# Create the Gradio interface.
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter question here..."),
    outputs="text",
    title="AIML Q&A Chatbot",
    description="Ask questions related to AIML and get answers from the fine-tuned Llama model.",
)

# Launch the app.
iface.launch()
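
# Optional smoke test from a second process: a minimal sketch using the
# gradio_client package, assuming the app is running at Gradio's default
# local URL (http://127.0.0.1:7860) and the auto-generated endpoint name
# "/predict". Adjust both if your launch settings differ.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   answer = client.predict("What is overfitting?", api_name="/predict")
#   print(answer)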