"""Gradio demo that answers questions about text and (optionally) images.

Images are turned into a textual description by the remote Moondream Space;
that description is prepended to the user's question and the combined prompt
is answered by the hosted QwQ model.
"""

import time  # noqa: F401  (retained from the original module; unused below)
from threading import Thread  # noqa: F401  (retained; unused below)

import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import InferenceClient
from PIL import Image  # noqa: F401  (retained; unused below)

# Initialize clients for Moondream and QwQ.
# Both are *remote* API clients: they expose no tokenizer/processor and no
# local generate() method, so every model call below goes through
# ``predict`` / ``chat_completion``.
moondream_client = Client("vikhyatk/moondream2")
qwq_client = InferenceClient("Qwen/QwQ-32B-Preview")


def _caption_image(image):
    """Return Moondream's answer to "Describe this image." for a file path."""
    return moondream_client.predict(
        img=handle_file(image),
        prompt="Describe this image.",
        api_name="/answer_question",
    )


def _file_path(file_entry):
    """Return the filesystem path of an uploaded file entry.

    Entries are either a plain path string (example images) or a dict that
    carries the path under the ``"path"`` key (regular uploads).
    """
    if isinstance(file_entry, str):
        return file_entry
    return file_entry["path"]


def _history_to_messages(history):
    """Convert chat history into text-only chat-completion messages.

    Two history layouts are accepted:

    * pair format: ``[user, assistant]`` entries; entries whose user part is
      not a string (file uploads) or that have no assistant reply yet are
      skipped, which also keeps the user/assistant roles alternating;
    * dict format: ``{"role": ..., "content": ...}`` entries; only entries
      with a string ``content`` are kept.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
            continue

        if not isinstance(turn, (list, tuple)) or len(turn) != 2:
            continue
        user_part, assistant_part = turn
        if isinstance(user_part, str) and isinstance(assistant_part, str):
            messages.append({"role": "user", "content": user_part})
            messages.append({"role": "assistant", "content": assistant_part})
    return messages


def describe_image(image, user_message):
    """Answer ``user_message`` with QwQ, using an image description as context.

    Args:
        image: Path of the image to describe. When falsy, no description is
            requested and the message is sent to QwQ on its own.
        user_message: The user's question; ``None`` is treated as ``""``.

    Returns:
        The text content of QwQ's first completion choice.
    """
    user_message = user_message or ""
    if image:
        # Same prompt layout as before: description, newline, user text.
        user_message = f"{_caption_image(image)}\n{user_message}"

    # Using QwQ model for conversation after description.
    qwq_result = qwq_client.chat_completion(
        messages=[{"role": "user", "content": user_message}],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return qwq_result["choices"][0]["message"]["content"]


def chat_or_image(message, history, max_new_tokens=250):
    """Stream a QwQ reply to a (possibly image-bearing) chat message.

    Args:
        message: Dict with the user's text under ``"text"`` and a list of
            attached files under ``"files"`` (path strings or dicts with a
            ``"path"`` key).
        history: Previous turns of the conversation; see
            ``_history_to_messages`` for the accepted layouts. May be empty
            or ``None``.
        max_new_tokens: Upper bound on the number of tokens QwQ generates.

    Yields:
        The reply accumulated so far, once per non-empty streamed chunk.
    """
    prompt = message.get("text") or ""
    files = message.get("files") or []

    # QwQ is text-only, so each attached image reaches it as a Moondream
    # description placed ahead of the user's text.
    captions = [str(_caption_image(_file_path(entry))) for entry in files]
    if captions:
        prompt = "\n".join([*captions, prompt])

    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": prompt})

    # stream=True makes chat_completion return an iterator of chunks, so no
    # background thread or separate streamer object is needed.
    stream = qwq_client.chat_completion(
        messages=messages,
        max_tokens=max_new_tokens,
        stream=True,
    )

    buffer = ""
    for chunk in stream:
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            buffer += piece
            yield buffer


def _interface_handler(image, text):
    """Adapt the two ``gr.Interface`` inputs to ``chat_or_image``.

    ``gr.Interface`` calls its function with one positional argument per
    input component (image path, textbox text), whereas ``chat_or_image``
    expects a message dict and a history list.
    """
    text = text or ""
    if not image and not text.strip():
        # Nothing to ask: avoid a pointless remote call.
        yield ""
        return

    message = {"text": text, "files": [image] if image else []}
    yield from chat_or_image(message, [])


# Gradio Interface setup.
# ``live=True`` was dropped: it re-runs the handler on every input change,
# i.e. one remote inference request per keystroke.
# NOTE(review): title/description mention "Llama" although the backends are
# Moondream and QwQ; strings left unchanged here, confirm intended wording.
demo = gr.Interface(
    fn=_interface_handler,
    inputs=[
        gr.Image(type="filepath", label="Upload image (Optional)"),
        gr.Textbox(label="Ask anything", placeholder="Ask...", lines=2),
    ],
    outputs="text",
    title="Multimodal Llama Chatbot",
    description="Interact with the Llama chatbot. Upload an image, ask a question, or both!",
)

if __name__ == "__main__":
    demo.launch(show_error=True)