Spaces:

GSridhar1982
/

QA_Llama31_Quantized_GGUF

Sleeping

Update app.py

e682ac6 verified about 2 months ago

1.11 kB

	import gradio as gr
	import subprocess

	from llama_cpp import Llama

	# Load the GGUF model file (path is relative to the working directory)
	# once at module import; generate_response() reuses this single instance.
	model = Llama(
	    model_path="QA_llama31_unsloth.Q4_K_M.gguf",
	)

	def generate_response(prompt: str) -> str:
	    """Answer ``prompt`` with the module-level llama.cpp model.

	    The prompt is sent as a single user message through
	    ``model.create_chat_completion``; no system prompt or prior
	    conversation turns are included.

	    Args:
	        prompt: The question text entered by the user.

	    Returns:
	        The ``content`` field of the first choice's message in the
	        completion response.
	    """
	    messages = [{"role": "user", "content": prompt}]
	    response = model.create_chat_completion(messages=messages)
	    # Only the first choice is read from the response.
	    return response["choices"][0]["message"]["content"]

	# Alternative inference path that shells out to the llama.cpp CLI binary.
	def predict(text: str) -> str:
	    """Run the llama.cpp command-line binary on ``text`` and return its stdout.

	    Args:
	        text: Prompt passed to the binary as the value of ``-p``.

	    Returns:
	        Everything the process wrote to standard output, decoded as text.

	    Raises:
	        OSError: If ``./llama.cpp/main`` cannot be started (for example,
	            the binary does not exist).
	        RuntimeError: If the process exits with a non-zero status; the
	            message includes the captured standard error.
	    """
	    # Arguments are given as a list (no shell involved), so the prompt
	    # reaches the binary verbatim as a single argument.
	    result = subprocess.run(
	        ["./llama.cpp/main", "-m", "QA_llama31_unsloth.Q4_K_M.gguf", "-p", text],
	        capture_output=True,
	        text=True,
	    )
	    # Surface failures instead of silently returning empty/partial output.
	    if result.returncode != 0:
	        raise RuntimeError(
	            f"llama.cpp exited with status {result.returncode}: "
	            f"{result.stderr.strip()}"
	        )
	    return result.stdout

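	# Earlier draft of the Gradio interface, kept for reference only.
	# It is disabled; the active `iface` definition follows below.
	#iface = gr.Interface(
	# fn=generate_response,
	# inputs=gr.Textbox(lines=2, placeholder="Enter question here..."),
	# outputs="Answer",
	#)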

	# Build the web UI: one text input routed through generate_response,
	# with the returned string shown as plain text.
	iface = gr.Interface(
	    generate_response,
	    title="AIML Q&A Chatbot",
	    description="Ask questions related to AIML and get answers from the fine-tuned Llama model.",
	    inputs="textbox",
	    outputs="text",
	)

	# Start serving the interface.
	iface.launch()