SkalskiP committed on
Commit
488d99e
·
1 Parent(s): c4ec446

ready to migrate to ZERO

Browse files
Files changed (7) hide show
  1. .gitignore +2 -1
  2. app.py +276 -78
  3. requirements-local.txt +0 -10
  4. requirements.txt +1 -0
  5. utils/modes.py +11 -5
  6. utils/sam.py +10 -2
  7. utils/video.py +26 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  /venv
2
- /.idea
 
 
1
  /venv
2
+ /.idea
3
+ /tmp
app.py CHANGED
@@ -1,16 +1,22 @@
 
1
  from typing import Tuple, Optional
2
 
 
3
  import gradio as gr
 
 
4
  import supervision as sv
5
  import torch
6
  from PIL import Image
 
 
7
 
8
  from utils.florence import load_florence_model, run_florence_inference, \
9
  FLORENCE_DETAILED_CAPTION_TASK, \
10
  FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
11
- from utils.modes import INFERENCE_MODES, OPEN_VOCABULARY_DETECTION, \
12
- CAPTION_GROUNDING_MASKS
13
- from utils.sam import load_sam_model, run_sam_inference
14
 
15
  MARKDOWN = """
16
  # Florence2 + SAM2 🔥
@@ -33,29 +39,46 @@ MARKDOWN = """
33
  This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
34
  the first stage, Florence2 performs tasks such as object detection, open-vocabulary
35
  object detection, image captioning, or phrase grounding. In the second stage, SAM2
36
- performs object segmentation on the image. **Video segmentation will be available
37
- soon.**
38
  """
39
 
40
- EXAMPLES = [
41
- [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw'],
42
- [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'napkin'],
43
- [OPEN_VOCABULARY_DETECTION, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
44
- [CAPTION_GROUNDING_MASKS, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
45
- [CAPTION_GROUNDING_MASKS, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
46
  ]
47
 
 
 
 
 
48
  DEVICE = torch.device("cuda")
 
 
 
 
 
 
 
 
49
  FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
50
- SAM_MODEL = load_sam_model(device=DEVICE)
51
- BOX_ANNOTATOR = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
 
 
 
52
  LABEL_ANNOTATOR = sv.LabelAnnotator(
 
53
  color_lookup=sv.ColorLookup.INDEX,
54
  text_position=sv.Position.CENTER_OF_MASS,
55
- text_color=sv.Color.from_hex("#FFFFFF"),
56
  border_radius=5
57
  )
58
- MASK_ANNOTATOR = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
 
 
 
59
 
60
 
61
  def annotate_image(image, detections):
@@ -68,38 +91,50 @@ def annotate_image(image, detections):
68
 
69
  def on_mode_dropdown_change(text):
70
  return [
71
- gr.Textbox(visible=text == OPEN_VOCABULARY_DETECTION),
72
- gr.Textbox(visible=text == CAPTION_GROUNDING_MASKS),
73
  ]
74
 
75
 
76
- def process(
 
 
 
77
  mode_dropdown, image_input, text_input
78
  ) -> Tuple[Optional[Image.Image], Optional[str]]:
79
  if not image_input:
 
80
  return None, None
81
 
82
- if mode_dropdown == OPEN_VOCABULARY_DETECTION:
83
  if not text_input:
 
84
  return None, None
85
 
86
- _, result = run_florence_inference(
87
- model=FLORENCE_MODEL,
88
- processor=FLORENCE_PROCESSOR,
89
- device=DEVICE,
90
- image=image_input,
91
- task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
92
- text=text_input
93
- )
94
- detections = sv.Detections.from_lmm(
95
- lmm=sv.LMM.FLORENCE_2,
96
- result=result,
97
- resolution_wh=image_input.size
98
- )
99
- detections = run_sam_inference(SAM_MODEL, image_input, detections)
 
 
 
 
 
 
 
100
  return annotate_image(image_input, detections), None
101
 
102
- if mode_dropdown == CAPTION_GROUNDING_MASKS:
103
  _, result = run_florence_inference(
104
  model=FLORENCE_MODEL,
105
  processor=FLORENCE_PROCESSOR,
@@ -121,65 +156,228 @@ def process(
121
  result=result,
122
  resolution_wh=image_input.size
123
  )
124
- detections = run_sam_inference(SAM_MODEL, image_input, detections)
125
  return annotate_image(image_input, detections), caption
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  with gr.Blocks() as demo:
129
  gr.Markdown(MARKDOWN)
130
- mode_dropdown_component = gr.Dropdown(
131
- choices=INFERENCE_MODES,
132
- value=INFERENCE_MODES[0],
133
- label="Mode",
134
- info="Select a mode to use.",
135
- interactive=True
136
- )
137
- with gr.Row():
138
- with gr.Column():
139
- image_input_component = gr.Image(
140
- type='pil', label='Upload image')
141
- text_input_component = gr.Textbox(
142
- label='Text prompt')
143
- submit_button_component = gr.Button(value='Submit', variant='primary')
144
- with gr.Column():
145
- image_output_component = gr.Image(type='pil', label='Image output')
146
- text_output_component = gr.Textbox(label='Caption output', visible=False)
147
-
148
- with gr.Row():
149
- gr.Examples(
150
- fn=process,
151
- examples=EXAMPLES,
152
- inputs=[
153
- mode_dropdown_component,
154
- image_input_component,
155
- text_input_component
156
- ],
157
- outputs=[
158
- image_output_component,
159
- text_output_component
160
- ],
161
- run_on_click=True
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  )
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- submit_button_component.click(
165
- fn=process,
166
  inputs=[
167
- mode_dropdown_component,
168
- image_input_component,
169
- text_input_component
170
  ],
171
  outputs=[
172
- image_output_component,
173
- text_output_component
174
  ]
175
  )
176
- mode_dropdown_component.change(
 
 
 
 
 
 
 
 
 
 
 
 
177
  on_mode_dropdown_change,
178
- inputs=[mode_dropdown_component],
179
  outputs=[
180
- text_input_component,
181
- text_output_component
182
  ]
183
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  demo.launch(debug=False, show_error=True)
 
1
+ import os
2
  from typing import Tuple, Optional
3
 
4
+ import cv2
5
  import gradio as gr
6
+ import numpy as np
7
+ import spaces
8
  import supervision as sv
9
  import torch
10
  from PIL import Image
11
+ from tqdm import tqdm
12
+ from utils.video import generate_unique_name, create_directory, delete_directory
13
 
14
  from utils.florence import load_florence_model, run_florence_inference, \
15
  FLORENCE_DETAILED_CAPTION_TASK, \
16
  FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
17
+ from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
18
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
19
+ from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
20
 
21
  MARKDOWN = """
22
  # Florence2 + SAM2 🔥
 
39
  This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
40
  the first stage, Florence2 performs tasks such as object detection, open-vocabulary
41
  object detection, image captioning, or phrase grounding. In the second stage, SAM2
42
+ performs object segmentation on the image.
 
43
  """
44
 
45
+ IMAGE_PROCESSING_EXAMPLES = [
46
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw, white napkin, black napkin, dog, hair, man'],
47
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
48
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
49
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
 
50
  ]
51
 
52
+ VIDEO_SCALE_FACTOR = 0.5
53
+ VIDEO_TARGET_DIRECTORY = "tmp"
54
+ create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
55
+
56
  DEVICE = torch.device("cuda")
57
+ # DEVICE = torch.device("cpu")
58
+
59
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
60
+ if torch.cuda.get_device_properties(0).major >= 8:
61
+ torch.backends.cuda.matmul.allow_tf32 = True
62
+ torch.backends.cudnn.allow_tf32 = True
63
+
64
+
65
  FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
66
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
67
+ SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
68
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
69
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
70
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
71
  LABEL_ANNOTATOR = sv.LabelAnnotator(
72
+ color=COLOR_PALETTE,
73
  color_lookup=sv.ColorLookup.INDEX,
74
  text_position=sv.Position.CENTER_OF_MASS,
75
+ text_color=sv.Color.from_hex("#000000"),
76
  border_radius=5
77
  )
78
+ MASK_ANNOTATOR = sv.MaskAnnotator(
79
+ color=COLOR_PALETTE,
80
+ color_lookup=sv.ColorLookup.INDEX
81
+ )
82
 
83
 
84
  def annotate_image(image, detections):
 
91
 
92
  def on_mode_dropdown_change(text):
93
  return [
94
+ gr.Textbox(visible=text == IMAGE_OPEN_VOCABULARY_DETECTION_MODE),
95
+ gr.Textbox(visible=text == IMAGE_CAPTION_GROUNDING_MASKS_MODE),
96
  ]
97
 
98
 
99
+ @spaces.GPU
100
+ @torch.inference_mode()
101
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
102
+ def process_image(
103
  mode_dropdown, image_input, text_input
104
  ) -> Tuple[Optional[Image.Image], Optional[str]]:
105
  if not image_input:
106
+ gr.Info("Please upload an image.")
107
  return None, None
108
 
109
+ if mode_dropdown == IMAGE_OPEN_VOCABULARY_DETECTION_MODE:
110
  if not text_input:
111
+ gr.Info("Please enter a text prompt.")
112
  return None, None
113
 
114
+ texts = [prompt.strip() for prompt in text_input.split(",")]
115
+ detections_list = []
116
+ for text in texts:
117
+ _, result = run_florence_inference(
118
+ model=FLORENCE_MODEL,
119
+ processor=FLORENCE_PROCESSOR,
120
+ device=DEVICE,
121
+ image=image_input,
122
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
123
+ text=text
124
+ )
125
+ detections = sv.Detections.from_lmm(
126
+ lmm=sv.LMM.FLORENCE_2,
127
+ result=result,
128
+ resolution_wh=image_input.size
129
+ )
130
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
131
+ detections_list.append(detections)
132
+
133
+ detections = sv.Detections.merge(detections_list)
134
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
135
  return annotate_image(image_input, detections), None
136
 
137
+ if mode_dropdown == IMAGE_CAPTION_GROUNDING_MASKS_MODE:
138
  _, result = run_florence_inference(
139
  model=FLORENCE_MODEL,
140
  processor=FLORENCE_PROCESSOR,
 
156
  result=result,
157
  resolution_wh=image_input.size
158
  )
159
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
160
  return annotate_image(image_input, detections), caption
161
 
162
 
163
@spaces.GPU(duration=300)
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process_video(
    mode_dropdown, video_input, text_input, progress=gr.Progress(track_tqdm=True)
) -> Optional[str]:
    """Segment prompted objects in a video and return the annotated video path.

    The first frame is run through Florence2 open-vocabulary detection once per
    comma-separated prompt, and the merged detections are passed through the
    SAM2 image model. Every frame is then downscaled by ``VIDEO_SCALE_FACTOR``
    and written to a unique directory under ``VIDEO_TARGET_DIRECTORY``. The
    first-frame masks are registered with the SAM2 video model at frame 0 and
    propagated through the video. Each frame is annotated with masks and boxes
    and written to an ``.mp4`` file.

    Args:
        mode_dropdown: Selected video mode. Received from the UI but not read
            in this function.
        video_input: Path of the uploaded video; falsy when nothing was
            uploaded.
        text_input: Comma-separated text prompts.
        progress: Gradio progress tracker that mirrors the ``tqdm`` loop.

    Returns:
        Path of the annotated video, or ``None`` when an input is missing or
        nothing was detected in the first frame.
    """
    if not video_input:
        gr.Info("Please upload a video.")
        return None

    if not text_input:
        gr.Info("Please enter a text prompt.")
        return None

    # Only the first frame is used to seed the tracker.
    frame_generator = sv.get_video_frames_generator(video_input)
    first_frame = next(frame_generator)
    first_frame = Image.fromarray(cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB))

    texts = [prompt.strip() for prompt in text_input.split(",")]
    detections_list = []
    for text in texts:
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=first_frame,
            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
            text=text
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=first_frame.size
        )
        detections = run_sam_inference(SAM_IMAGE_MODEL, first_frame, detections)
        detections_list.append(detections)

    detections = sv.Detections.merge(detections_list)
    detections = run_sam_inference(SAM_IMAGE_MODEL, first_frame, detections)

    # `mask` is optional on sv.Detections; treat a missing mask like an empty
    # one instead of letting len(None) raise.
    if detections.mask is None or len(detections.mask) == 0:
        # The f-prefix is required so the user sees the actual prompt text.
        gr.Info(
            f"No objects of class {text_input} found in the first frame of the video. "
            "Trim the video to make the object appear in the first frame or try a "
            "different text prompt."
        )
        return None

    name = generate_unique_name()
    frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
    frames_sink = sv.ImageSink(
        target_dir_path=frame_directory_path,
        image_name_pattern="{:05d}.jpeg"
    )

    # The output video uses the downscaled resolution of the saved frames.
    video_info = sv.VideoInfo.from_video_path(video_input)
    video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
    video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)

    frames_generator = sv.get_video_frames_generator(video_input)
    with frames_sink:
        for frame in tqdm(
            frames_generator,
            total=video_info.total_frames,
            desc="splitting video into frames"
        ):
            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
            frames_sink.save_image(frame)

    inference_state = SAM_VIDEO_MODEL.init_state(
        video_path=frame_directory_path,
        device=DEVICE
    )

    # Register every first-frame mask as a separate tracked object.
    for mask_index, mask in enumerate(detections.mask):
        SAM_VIDEO_MODEL.add_new_mask(
            inference_state=inference_state,
            frame_idx=0,
            obj_id=mask_index,
            mask=mask
        )

    video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
    frames_generator = sv.get_video_frames_generator(video_input)
    masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)
    with sv.VideoSink(video_path, video_info=video_info) as sink:
        for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
            masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
            # Drop the singleton axis when masks arrive as a 4-D array.
            if len(masks.shape) == 4:
                masks = np.squeeze(masks, axis=1)

            frame_detections = sv.Detections(
                xyxy=sv.mask_to_xyxy(masks=masks),
                mask=masks,
                class_id=np.array(tracker_ids)
            )
            annotated_frame = frame.copy()
            annotated_frame = MASK_ANNOTATOR.annotate(
                scene=annotated_frame, detections=frame_detections)
            annotated_frame = BOX_ANNOTATOR.annotate(
                scene=annotated_frame, detections=frame_detections)
            sink.write_frame(annotated_frame)

    # The intermediate JPEG frames are no longer needed once the video is written.
    delete_directory(frame_directory_path)
    return video_path
269
+
270
+
271
  with gr.Blocks() as demo:
272
  gr.Markdown(MARKDOWN)
273
+ with gr.Tab("Image"):
274
+ image_processing_mode_dropdown_component = gr.Dropdown(
275
+ choices=IMAGE_INFERENCE_MODES,
276
+ value=IMAGE_INFERENCE_MODES[0],
277
+ label="Mode",
278
+ info="Select a mode to use.",
279
+ interactive=True
280
+ )
281
+ with gr.Row():
282
+ with gr.Column():
283
+ image_processing_image_input_component = gr.Image(
284
+ type='pil', label='Upload image')
285
+ image_processing_text_input_component = gr.Textbox(
286
+ label='Text prompt',
287
+ placeholder='Enter comma separated text prompts')
288
+ image_processing_submit_button_component = gr.Button(
289
+ value='Submit', variant='primary')
290
+ with gr.Column():
291
+ image_processing_image_output_component = gr.Image(
292
+ type='pil', label='Image output')
293
+ image_processing_text_output_component = gr.Textbox(
294
+ label='Caption output', visible=False)
295
+
296
+ with gr.Row():
297
+ gr.Examples(
298
+ fn=process_image,
299
+ examples=IMAGE_PROCESSING_EXAMPLES,
300
+ inputs=[
301
+ image_processing_mode_dropdown_component,
302
+ image_processing_image_input_component,
303
+ image_processing_text_input_component
304
+ ],
305
+ outputs=[
306
+ image_processing_image_output_component,
307
+ image_processing_text_output_component
308
+ ],
309
+ run_on_click=True
310
+ )
311
+ with gr.Tab("Video"):
312
+ video_processing_mode_dropdown_component = gr.Dropdown(
313
+ choices=VIDEO_INFERENCE_MODES,
314
+ value=VIDEO_INFERENCE_MODES[0],
315
+ label="Mode",
316
+ info="Select a mode to use.",
317
+ interactive=True
318
  )
319
+ with gr.Row():
320
+ with gr.Column():
321
+ video_processing_video_input_component = gr.Video(
322
+ label='Upload video')
323
+ video_processing_text_input_component = gr.Textbox(
324
+ label='Text prompt',
325
+ placeholder='Enter comma separated text prompts')
326
+ video_processing_submit_button_component = gr.Button(
327
+ value='Submit', variant='primary')
328
+ with gr.Column():
329
+ video_processing_video_output_component = gr.Video(
330
+ label='Video output')
331
 
332
+ image_processing_submit_button_component.click(
333
+ fn=process_image,
334
  inputs=[
335
+ image_processing_mode_dropdown_component,
336
+ image_processing_image_input_component,
337
+ image_processing_text_input_component
338
  ],
339
  outputs=[
340
+ image_processing_image_output_component,
341
+ image_processing_text_output_component
342
  ]
343
  )
344
+ image_processing_text_input_component.submit(
345
+ fn=process_image,
346
+ inputs=[
347
+ image_processing_mode_dropdown_component,
348
+ image_processing_image_input_component,
349
+ image_processing_text_input_component
350
+ ],
351
+ outputs=[
352
+ image_processing_image_output_component,
353
+ image_processing_text_output_component
354
+ ]
355
+ )
356
+ image_processing_mode_dropdown_component.change(
357
  on_mode_dropdown_change,
358
+ inputs=[image_processing_mode_dropdown_component],
359
  outputs=[
360
+ image_processing_text_input_component,
361
+ image_processing_text_output_component
362
  ]
363
  )
364
+ video_processing_submit_button_component.click(
365
+ fn=process_video,
366
+ inputs=[
367
+ video_processing_mode_dropdown_component,
368
+ video_processing_video_input_component,
369
+ video_processing_text_input_component
370
+ ],
371
+ outputs=video_processing_video_output_component
372
+ )
373
+ video_processing_text_input_component.submit(
374
+ fn=process_video,
375
+ inputs=[
376
+ video_processing_mode_dropdown_component,
377
+ video_processing_video_input_component,
378
+ video_processing_text_input_component
379
+ ],
380
+ outputs=video_processing_video_output_component
381
+ )
382
 
383
  demo.launch(debug=False, show_error=True)
requirements-local.txt DELETED
@@ -1,10 +0,0 @@
1
- torch
2
- einops
3
- spaces
4
- timm
5
- transformers
6
- samv2
7
- gradio
8
- supervision
9
- opencv-python
10
- pytest
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  einops
2
  spaces
3
  timm
 
1
+ tqdm
2
  einops
3
  spaces
4
  timm
utils/modes.py CHANGED
@@ -1,7 +1,13 @@
1
- OPEN_VOCABULARY_DETECTION = "open vocabulary detection + masks"
2
- CAPTION_GROUNDING_MASKS = "caption + grounding + masks"
3
 
4
- INFERENCE_MODES = [
5
- OPEN_VOCABULARY_DETECTION,
6
- CAPTION_GROUNDING_MASKS
 
 
 
 
 
 
7
  ]
 
1
+ IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
2
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"
3
 
4
+ IMAGE_INFERENCE_MODES = [
5
+ IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
6
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE
7
+ ]
8
+
9
+ VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"
10
+
11
+ VIDEO_INFERENCE_MODES = [
12
+ VIDEO_OPEN_VOCABULARY_DETECTION_MODE
13
  ]
utils/sam.py CHANGED
@@ -4,14 +4,14 @@ import numpy as np
4
  import supervision as sv
5
  import torch
6
  from PIL import Image
7
- from sam2.build_sam import build_sam2
8
  from sam2.sam2_image_predictor import SAM2ImagePredictor
9
 
10
  SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
11
  SAM_CONFIG = "sam2_hiera_s.yaml"
12
 
13
 
14
- def load_sam_model(
15
  device: torch.device,
16
  config: str = SAM_CONFIG,
17
  checkpoint: str = SAM_CHECKPOINT
@@ -20,6 +20,14 @@ def load_sam_model(
20
  return SAM2ImagePredictor(sam_model=model)
21
 
22
 
 
 
 
 
 
 
 
 
23
  def run_sam_inference(
24
  model: Any,
25
  image: Image,
 
4
  import supervision as sv
5
  import torch
6
  from PIL import Image
7
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
8
  from sam2.sam2_image_predictor import SAM2ImagePredictor
9
 
10
  SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
11
  SAM_CONFIG = "sam2_hiera_s.yaml"
12
 
13
 
14
+ def load_sam_image_model(
15
  device: torch.device,
16
  config: str = SAM_CONFIG,
17
  checkpoint: str = SAM_CHECKPOINT
 
20
  return SAM2ImagePredictor(sam_model=model)
21
 
22
 
23
def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    """Build the SAM2 video predictor.

    Args:
        device: Torch device passed to the predictor builder.
        config: Model config name; defaults to ``SAM_CONFIG``.
        checkpoint: Checkpoint path; defaults to ``SAM_CHECKPOINT``.

    Returns:
        Whatever ``build_sam2_video_predictor`` returns for these arguments.
    """
    video_predictor = build_sam2_video_predictor(config, checkpoint, device=device)
    return video_predictor
29
+
30
+
31
  def run_sam_inference(
32
  model: Any,
33
  image: Image,
utils/video.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import os
3
+ import shutil
4
+ import uuid
5
+
6
+
7
def create_directory(directory_path: str) -> None:
    """Create ``directory_path`` and any missing parents.

    An already-existing directory is left untouched. ``exist_ok=True`` replaces
    the earlier exists-then-create check, which could fail if another process
    created the directory between the check and the ``makedirs`` call.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    os.makedirs(directory_path, exist_ok=True)
10
+
11
+
12
def delete_directory(directory_path: str) -> None:
    """Recursively delete ``directory_path``.

    Args:
        directory_path: Path of the directory tree to remove.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist.
        PermissionError: If removal is denied; the original error is attached
            as the cause.
    """
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")

    try:
        shutil.rmtree(directory_path)
    except PermissionError as error:
        # Chain explicitly so the underlying OS error stays in the traceback.
        raise PermissionError(
            f"Permission denied: Unable to delete '{directory_path}'."
        ) from error
21
+
22
+
23
def generate_unique_name():
    """Return a name of the form ``<YYYYmmddHHMMSS>_<uuid4>``.

    The prefix is the current local time and the suffix is a freshly generated
    random UUID.
    """
    timestamp = f"{datetime.datetime.now():%Y%m%d%H%M%S}"
    random_suffix = str(uuid.uuid4())
    return "_".join((timestamp, random_suffix))