enable fp16 inference
- app.py +1 -1
- utils/dc_utils.py +2 -0
- video_depth_anything/dpt_temporal.py +4 -2
- video_depth_anything/video_depth.py +2 -1
app.py
CHANGED
@@ -128,7 +128,7 @@ def construct_demo():
                     label="max process length",
                     minimum=-1,
                     maximum=1000,
-                    value
+                    value=500,
                     step=1,
                 )
                 target_fps = gr.Slider(
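For context, a minimal sketch of how a slider like this is typically wired up in a Gradio app. Only the slider arguments come from the hunk above; the `process` stub, the other components, and the launch code are illustrative assumptions, not the Space's actual code.

```python
import gradio as gr

def process(video_path, max_len):
    # hypothetical stand-in for the Space's depth-inference pipeline
    return f"would process at most {int(max_len)} frames of {video_path}"

def construct_demo():
    with gr.Blocks() as demo:
        video_in = gr.Video(label="input video")
        max_len = gr.Slider(
            label="max process length",
            minimum=-1,   # -1 conventionally means "no limit"
            maximum=1000,
            value=500,    # the new default introduced by this commit
            step=1,
        )
        status = gr.Textbox(label="status")
        gr.Button("Run").click(process, inputs=[video_in, max_len], outputs=status)
    return demo

if __name__ == "__main__":
    construct_demo().launch()
```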
utils/dc_utils.py
CHANGED
@@ -22,10 +22,12 @@ def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1):
     original_height, original_width = vid.get_batch([0]).shape[1:3]
     height = original_height
     width = original_width
+    print(f'==> original video size: {original_height} x {original_width}')
     if max_res > 0 and max(height, width) > max_res:
         scale = max_res / max(original_height, original_width)
         height = ensure_even(round(original_height * scale))
         width = ensure_even(round(original_width * scale))
+        print(f'==> downsample video size: {height} x {width}')
 
     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
 
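The resize logic above caps the longer side at `max_res` while keeping both dimensions even (video codecs generally require even frame sizes). Below is a self-contained sketch of that computation; `ensure_even` is defined elsewhere in `dc_utils.py`, so the implementation here is an assumption consistent with its name:

```python
def ensure_even(value: int) -> int:
    # assumed behavior: round down to the nearest even integer
    return value if value % 2 == 0 else value - 1

def target_size(original_height: int, original_width: int, max_res: int):
    """Mirrors the resize logic in read_video_frames."""
    height, width = original_height, original_width
    if max_res > 0 and max(height, width) > max_res:
        scale = max_res / max(original_height, original_width)
        height = ensure_even(round(original_height * scale))
        width = ensure_even(round(original_width * scale))
    return height, width

# e.g. a 2160x3840 (4K) video capped at max_res=1280 becomes 720x1280
print(target_size(2160, 3840, 1280))  # -> (720, 1280)
```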
video_depth_anything/dpt_temporal.py
CHANGED
@@ -91,6 +91,8 @@ class DPTHeadTemporal(DPTHead):
         out = F.interpolate(
             out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
         )
-        out = self.scratch.output_conv2(out)
+        ori_type = out.dtype
+        with torch.autocast(device_type="cuda", enabled=False):
+            out = self.scratch.output_conv2(out.float())
 
-        return out
+        return out.to(ori_type)
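This change carves out an fp32 "island" inside the fp16 autocast region: the head's final convolution is run in full precision for numerical stability, and the result is cast back so downstream code keeps seeing the autocast dtype. A minimal sketch of the same pattern on a toy layer (requires a CUDA device; the layer shapes are placeholders for `self.scratch.output_conv2`):

```python
import torch
import torch.nn as nn

body = nn.Conv2d(32, 32, 3, padding=1).cuda()         # stand-in for the DPT head body
output_conv2 = nn.Conv2d(32, 1, 3, padding=1).cuda()  # stand-in for scratch.output_conv2
x = torch.randn(1, 32, 64, 64, device="cuda")

with torch.no_grad(), torch.autocast(device_type="cuda", enabled=True):
    out = body(x)             # convolutions run in fp16 under autocast
    ori_type = out.dtype      # torch.float16
    with torch.autocast(device_type="cuda", enabled=False):
        # autocast disabled: upcasting the input keeps this op in true fp32
        out = output_conv2(out.float())
    out = out.to(ori_type)    # cast back so the caller still sees fp16

print(ori_type, out.dtype)    # torch.float16 torch.float16
```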
video_depth_anything/video_depth.py
CHANGED
@@ -104,7 +104,8 @@ class VideoDepthAnything(nn.Module):
                 cur_input[:, :OVERLAP, ...] = pre_input[:, KEYFRAMES, ...]
 
             with torch.no_grad():
-                depth = self.forward(cur_input) # depth shape: [1, T, H, W]
+                with torch.autocast(device_type=device, enabled=True):
+                    depth = self.forward(cur_input) # depth shape: [1, T, H, W]
 
             depth = F.interpolate(depth.flatten(0,1).unsqueeze(1), size=(frame_height, frame_width), mode='bilinear', align_corners=True)
             depth_list += [depth[i][0].cpu().numpy() for i in range(depth.shape[0])]
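This is the switch that actually enables fp16: wrapping the forward pass in `torch.autocast` makes eligible ops (convolutions, matmuls) run in half precision, while the fp32 island in `dpt_temporal.py` above protects the final projection. A minimal sketch of the same inference pattern with a toy model (the model, shapes, and device handling are illustrative, not the Space's actual pipeline):

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
    nn.Conv2d(8, 1, 3, padding=1),
).to(device)
frames = torch.randn(1, 3, 128, 128, device=device)  # stand-in input batch

with torch.no_grad():
    # enabled=True selects fp16 on CUDA (bf16 on CPU); ops without a
    # half-precision kernel transparently stay in fp32
    with torch.autocast(device_type=device, enabled=True):
        depth = model(frames)

print(depth.dtype)  # torch.float16 on CUDA
```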