# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# This space was created by SANJOG GHONGE for testing and learning purposes.
#
# If you want this space or the credits removed, please contact me at ghongesanjog@gmail.com.
#
# Citation:
# @misc{qvq-72b-preview,
#     title  = {QVQ: To See the World with Wisdom},
#     url    = {https://qwenlm.github.io/blog/qvq-72b-preview/},
#     author = {Qwen Team},
#     month  = {December},
#     year   = {2024}
# }
#
# @article{Qwen2VL,
#     title   = {Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
#     author  = {Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai, Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang, Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou, Jingren and Lin, Junyang},
#     journal = {arXiv preprint arXiv:2409.12191},
#     year    = {2024}
# }
#
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info
import gradio as gr
from PIL import Image
import torch

# Configuration for 4-bit (NF4) quantization so the 72B model fits in less GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the model (sharded across available devices, overflow offloaded to disk) and its processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/QVQ-72B-Preview",
    device_map="auto",
    quantization_config=quantization_config,
    offload_folder="offload",
)
processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")


# Answer a question about an uploaded image
def process_image_and_question(image, question):
    if image is None or question.strip() == "":
        return "Please provide both an image and a question."

    # Prepare the chat messages (system prompt + user image/question)
    messages = [
        {
            "role": "system",
            "content": [
                # {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
                {"type": "text", "text": "You are a helpful assistant; you give answers in JSON."}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Build the prompt text and extract the vision inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate the answer and drop the prompt tokens from the decoded output
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text[0] if output_text else "No output generated.")
    return output_text[0] if output_text else "No output generated."
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Sanjog Test : Image and Question Answering\nProvide an image (JPG/PNG) and a related question to get an answer.")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
            question_input = gr.Textbox(label="Enter your question")
        with gr.Column():
            output_box = gr.Textbox(label="Result", interactive=False)
    with gr.Row():
        clear_button = gr.Button("Clear")
        submit_button = gr.Button("Submit")

    # Define button functionality
    clear_button.click(lambda: (None, "", ""), inputs=[], outputs=[image_input, question_input, output_box])
    submit_button.click(process_image_and_question, inputs=[image_input, question_input], outputs=output_box)

# Launch the interface
demo.launch()
print(torch.cuda.memory_summary())
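
# Optional: a minimal smoke test of process_image_and_question without the Gradio UI.
# This is an illustrative sketch, not part of the original app, and is left commented out so the
# Space still just launches the interface above; "sample.jpg" and the question are placeholders.
#
# if __name__ == "__main__":
#     test_image = Image.open("sample.jpg")  # hypothetical local image path
#     print(process_image_and_question(test_image, "Describe this image."))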