Ayush0804 committed on
Commit
71b4dd6
·
verified ·
1 Parent(s): 7a555bd

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +61 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Describe a single demo image with the Qwen2-VL-72B-Instruct model.

The script loads the model and its processor, builds a one-turn chat
message holding an image URL and a text prompt, runs generation, and
prints the decoded answer.
"""

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import torchvision

# NOTE(review): AutoTokenizer, torch and torchvision are not referenced
# directly below; the imports are kept exactly as in the original file.

# Single source of truth for the checkpoint, so the model and the processor
# are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen2-VL-72B-Instruct"

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory
# saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained(MODEL_ID)

# The default range for the number of visual tokens per image in the model is
# 4-16384. You can set min_pixels and max_pixels according to your needs, such
# as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

# One user turn: an image (given by URL) followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
# Render the chat into a prompt string (not token ids) that ends with the
# generation prompt.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the device the model was actually placed on by device_map="auto"
# instead of assuming a CUDA device is present.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Slice off the first len(prompt) ids of every sequence so that only the
# tokens after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ transformers
4
+ qwen_vl_utils