# QwenVL2Demo / qwenvl.py
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import spaces
class inputParent():
    # Wraps a single model input together with the path it was loaded from.
    # Calling an instance returns its raw payload.
    def __init__(self, source_path, raw_data):
        self.sourcePath = source_path
        self.rawData = raw_data

    def __call__(self):
        return self.rawData


class imageInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)


class videoInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)


class textInput(inputParent):
    def __init__(self, source_path, raw_data):
        super().__init__(source_path, raw_data)
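

# The wrappers give heterogeneous inputs one shared interface: calling an
# instance returns its raw payload. For example (hypothetical values):
#   txt = textInput(None, 'Describe this image.')
#   txt()  # -> 'Describe this image.'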
class QwenVLModel():
    def __init__(self,
                 model='Qwen/Qwen2-VL-7B-Instruct',
                 device_map='auto'):
        self.modelName = model
        self.deviceMap = device_map
        # Load the model and processor from the configured checkpoint
        # (the constructor arguments were previously ignored in favour of hard-coded strings).
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, device_map=self.deviceMap)
        self.processor = AutoProcessor.from_pretrained(self.modelName)
        self.conversation = []
        self.verbose = True
    def addToConversation(self, inputs, role='user'):
        # Start a new conversation turn for the given role.
        self.conversation.append(
            {
                'role': role,
                'content': []
            }
        )
        for _input in inputs:
            # Use isinstance checks: `_input is imageInput` compares the instance
            # against the class object itself and is always False.
            if isinstance(_input, imageInput):
                self.conversation[-1]['content'].append({'type': 'image'})
            elif isinstance(_input, videoInput):
                self.conversation[-1]['content'].append({'type': 'video'})
            elif isinstance(_input, textInput):
                # The Qwen2-VL chat template expects the text under the 'text' key.
                self.conversation[-1]['content'].append(
                    {
                        'type': 'text',
                        'text': _input()
                    }
                )
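
    # Example of the turn structure addToConversation builds for one image plus
    # one text input (this is the message format the Qwen2-VL chat template consumes):
    # [
    #     {
    #         'role': 'user',
    #         'content': [
    #             {'type': 'image'},
    #             {'type': 'text', 'text': 'Describe this image.'}
    #         ]
    #     }
    # ]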
    @spaces.GPU
    def oneImagecall(self, image_input: Image.Image, user_input):
        # Wrap the raw inputs; no source path is available here, so pass None.
        inputs = [imageInput(None, image_input), textInput(None, user_input)]
        self.addToConversation(inputs=inputs)
        # Preprocess the inputs
        text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
        # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
        inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
        # Move the tensors to the same device as the model (GPU when available).
        inputs = inputs.to(self.model.device)
        # Inference: generate and keep only the newly produced tokens.
        output_ids = self.model.generate(**inputs, max_new_tokens=128)
        generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)]
        output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if self.verbose:
            print(output_text)
        return output_text
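

# Minimal usage sketch (assumptions: 'example.jpg' is a placeholder path and a GPU
# is provided by the Spaces runtime; neither is part of the original file).
if __name__ == '__main__':
    demo_model = QwenVLModel()
    demo_image = Image.open('example.jpg')  # hypothetical example image
    print(demo_model.oneImagecall(demo_image, 'Describe this image.'))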