from PIL import Image
import requests
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import spaces
class inputParent:
    """Base wrapper for a single user input (image, video, or text)."""
    # raw_data comes first so callers can pass just the payload;
    # the original signature required a source_path the callers never supplied.
    def __init__(self, raw_data, source_path=None):
        self.sourcePath = source_path
        self.rawData = raw_data

    def __call__(self):
        return self.rawData

class imageInput(inputParent):
    pass

class videoInput(inputParent):
    pass

class textInput(inputParent):
    pass

class QwenVLModel:
    def __init__(self,
                 model='Qwen/Qwen2-VL-7B-Instruct',
                 device_map='auto'):
        self.modelName = model
        self.deviceMap = device_map
        # Load the checkpoint named by the constructor arguments rather than
        # re-hard-coding the model id and device map.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, device_map=self.deviceMap)
        self.processor = AutoProcessor.from_pretrained(self.modelName)
        self.conversation = []
        self.verbose = True
    def addToConversation(self, inputs, role='user'):
        self.conversation.append({'role': role, 'content': []})
        content = self.conversation[-1]['content']
        for _input in inputs:
            # `_input is imageInput` compares an instance against the class
            # object and is always False; isinstance() is the correct check.
            if isinstance(_input, imageInput):
                content.append({'type': 'image'})
            elif isinstance(_input, videoInput):
                content.append({'type': 'video'})
            elif isinstance(_input, textInput):
                # Qwen2-VL's chat template expects the text under the 'text'
                # key, not 'content'.
                content.append({'type': 'text', 'text': _input()})
    def oneImagecall(self, image_input: Image.Image, user_input):
        inputs = [imageInput(image_input), textInput(user_input)]
        self.addToConversation(inputs=inputs)
        # Preprocess the inputs
        text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
        # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
        model_inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
        # Move tensors to wherever device_map placed the model instead of
        # forcing CPU, which would crash when the weights sit on a GPU.
        model_inputs = model_inputs.to(self.model.device)
        # Inference: generation of the output
        output_ids = self.model.generate(**model_inputs, max_new_tokens=128)
        # Strip the prompt tokens so only the newly generated tokens are decoded.
        generated_ids = [out[len(inp):] for inp, out in zip(model_inputs.input_ids, output_ids)]
        output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if self.verbose:
            print(output_text)
        return output_text
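
# Usage sketch, not part of the original Space: load the model once and ask a
# question about a local image. 'example.jpg' and the prompt string are
# placeholder assumptions for illustration.
if __name__ == '__main__':
    qwen = QwenVLModel()
    image = Image.open('example.jpg')
    print(qwen.oneImagecall(image, 'Describe this image.'))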