from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import spaces


class inputParent():
    def __init__(self, source_path, raw_data):
        self.sourcePath = source_path
        self.rawData = raw_data

    def __call__(self):
        return self.rawData


class imageInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)


class videoInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)


class textInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)


class QwenVLModel():
    def __init__(self, model='Qwen/Qwen2-VL-7B-Instruct', device_map='auto'):
        self.modelName = model
        self.deviceMap = device_map
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, device_map=self.deviceMap)
        self.processor = AutoProcessor.from_pretrained(self.modelName)
        self.conversation = []
        self.verbose = True

    def addToConversation(self, inputs, role='user'):
        # Append a new turn and describe each input with the content type the
        # Qwen2-VL chat template expects ('image', 'video', or 'text').
        self.conversation.append({'role': role, 'content': []})
        for _input in inputs:
            if isinstance(_input, imageInput):
                self.conversation[-1]['content'].append({'type': 'image'})
            if isinstance(_input, videoInput):
                self.conversation[-1]['content'].append({'type': 'video'})
            if isinstance(_input, textInput):
                self.conversation[-1]['content'].append({'type': 'text', 'text': _input()})

    @spaces.GPU
    def oneImagecall(self, image_input: Image.Image, user_input):
        inputs = [imageInput(source_path=None, raw_data=image_input),
                  textInput(source_path=None, raw_data=user_input)]
        self.addToConversation(inputs=inputs)
        # Preprocess the inputs
        text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
        # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
        inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
        # Move the tensors to the same device as the model (device_map='auto' may place it on a GPU).
        inputs = inputs.to(self.model.device)
        # Inference: generation of the output
        output_ids = self.model.generate(**inputs, max_new_tokens=128)
        # Strip the prompt tokens so only the newly generated tokens are decoded.
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
        output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if self.verbose:
            print(output_text)
        return output_text
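

# --- Usage sketch (not part of the original module; added for illustration) ---
# A minimal example of how QwenVLModel might be driven: load a local image,
# ask for a description, and print the decoded answer. The file name
# "example.jpg" is a hypothetical placeholder, and running this downloads the
# Qwen/Qwen2-VL-7B-Instruct weights on first use.
if __name__ == "__main__":
    model = QwenVLModel()
    image = Image.open("example.jpg")  # hypothetical local image file
    answer = model.oneImagecall(image, "Describe this image.")
    print(answer)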