KevinQHLin commited on
Commit
58e1599
·
verified ·
1 Parent(s): 36b4ab7

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +82 -0
README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## ⭐ Quick Start
2
+
3
+ 1. Load model
4
+ ```python
5
+ import ast
6
+ import torch
7
+ from PIL import Image, ImageDraw
8
+ from qwen_vl_utils import process_vision_info
9
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
10
+
11
+ def draw_point(image_input, point=None, radius=5):
12
+ if isinstance(image_input, str):
13
+ image = Image.open(BytesIO(requests.get(image_input).content)) if image_input.startswith('http') else Image.open(image_input)
14
+ else:
15
+ image = image_input
16
+
17
+ if point:
18
+ x, y = point[0] * image.width, point[1] * image.height
19
+ ImageDraw.Draw(image).ellipse((x - radius, y - radius, x + radius, y + radius), fill='red')
20
+ display(image)
21
+ return
22
+
23
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
24
+ "showlab/ShowUI-2B",
25
+ torch_dtype=torch.bfloat16,
26
+ device_map="auto"
27
+ )
28
+
29
+ min_pixels = 256*28*28
30
+ max_pixels = 1344*28*28
31
+
32
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
33
+ ```
34
+
35
+ 2. Load screenshot and query
36
+ ```python
37
+ img_url = 'web_dbd7514b-9ca3-40cd-b09a-990f7b955da1.png'
38
+ query = "Nahant"
39
+
40
+
41
+ _SYSTEM = "Based on the screenshot of the page, I give a text description and you give its corresponding location. The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."
42
+ messages = [
43
+ {
44
+ "role": "user",
45
+ "content": [
46
+ {"type": "text", "text": _SYSTEM},
47
+ {"type": "image", "image": img_url, "min_pixels": min_pixels, "max_pixels": max_pixels},
48
+ {"type": "text", "text": query}
49
+ ],
50
+ }
51
+ ]
52
+
53
+ text = processor.apply_chat_template(
54
+ messages, tokenize=False, add_generation_prompt=True,
55
+ )
56
+ image_inputs, video_inputs = process_vision_info(messages)
57
+ inputs = processor(
58
+ text=[text],
59
+ images=image_inputs,
60
+ videos=video_inputs,
61
+ padding=True,
62
+ return_tensors="pt",
63
+ )
64
+ inputs = inputs.to("cuda")
65
+
66
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
67
+ generated_ids_trimmed = [
68
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
69
+ ]
70
+ output_text = processor.batch_decode(
71
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
72
+ )[0]
73
+
74
+ click_xy = ast.literal_eval(output_text)
75
+ # [0.73, 0.21]
76
+
77
+ draw_point(img_url, click_xy, 10)
78
+ ```
79
+
80
+ This will visualize the grounding results like (where the red points are [x,y])
81
+
82
+ ![download](https://github.com/user-attachments/assets/8fe2783d-05b6-44e6-a26c-8718d02b56cb)