import time

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Phi-3.5-vision-instruct"

# Note: set _attn_implementation='eager' if you don't have flash_attn installed.
# The whole app runs on CPU, so the model is loaded there directly.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager",
)

device = torch.device("cpu")
model.to(device)
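
# Sketch of an alternative setup (not part of this app): on a machine with a CUDA GPU
# and flash_attn installed, the note above suggests loading with flash attention instead.
# The values below are assumptions, not this app's configuration.
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="cuda",
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
#     _attn_implementation="flash_attention_2",
# )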

# For best performance, use num_crops=4 for multi-frame and num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
    num_crops=4,
)
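
# Sketch (assumption, not this app's setting): since this demo sends one image per request,
# the single-frame recommendation from the comment above would look like:
# processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)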

user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"
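
# Example: with a single image and the question "What is in this picture?", the prompt
# assembled in call_model() below looks like:
# <|user|>
# <|image_1|>
# What is in this picture?<|end|>
# <|assistant|>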

def call_model(raw_image=None, text_input=None):
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = raw_image.convert("RGB")
    inputs = processor(prompt, image, return_tensors="pt").to(device)
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
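
# Minimal local sanity check (sketch; "example.jpg" is a hypothetical file, not shipped
# with the app):
# from PIL import Image
# print(call_model(raw_image=Image.open("example.jpg"), text_input="Describe this image."))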

def get_model_memory_footprint(model_):
    footprint = model_.get_memory_footprint()
    return f"Memory footprint of the model: {footprint / 1e6:.2f} MB"

def process(raw_image, prompt):
    print("start...")
    start_time = time.time()
    memory_usage = get_model_memory_footprint(model)
    model_response = call_model(raw_image=raw_image, text_input=prompt)
    end_time = time.time()
    execution_time = end_time - start_time
    execution_time_min = round(execution_time / 60, 2)
    print(f"Execution time: {execution_time:.4f} seconds")
    print(f"Execution time: {execution_time_min:.2f} min")
    return memory_usage, model_response, execution_time_min

iface = gr.Interface(
    process,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="What do you want to ask?")],
    outputs=[gr.Textbox(label="Memory usage"), gr.Textbox(label="Model response"),
             gr.Textbox(label="Execution time (min)")],
)

if __name__ == "__main__":
    iface.launch()
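
# Sketch: outside of Spaces, a temporary public link could be requested instead,
# e.g. iface.launch(share=True); the plain launch() above is what this app uses.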