enable fp16 inference
- app.py +1 -1
- utils/dc_utils.py +2 -0
- video_depth_anything/dpt_temporal.py +4 -2
- video_depth_anything/video_depth.py +2 -1
app.py
CHANGED
@@ -128,7 +128,7 @@ def construct_demo():
                     label="max process length",
                     minimum=-1,
                     maximum=1000,
-                    value
+                    value=500,
                     step=1,
                 )
                 target_fps = gr.Slider(
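For context, a minimal sketch of how a slider like this is typically wired up in a Gradio app. Only the slider arguments come from the hunk above; the `process` stub, the other components, and the launch code are illustrative assumptions, not the Space's actual code.

```python
import gradio as gr

def process(video_path, max_len):
    # hypothetical stand-in for the Space's depth-inference pipeline
    return f"would process at most {int(max_len)} frames of {video_path}"

def construct_demo():
    with gr.Blocks() as demo:
        video_in = gr.Video(label="input video")
        max_len = gr.Slider(
            label="max process length",
            minimum=-1,   # -1 conventionally means "no limit"
            maximum=1000,
            value=500,    # the new default introduced by this commit
            step=1,
        )
        status = gr.Textbox(label="status")
        gr.Button("Run").click(process, inputs=[video_in, max_len], outputs=status)
    return demo

if __name__ == "__main__":
    construct_demo().launch()
```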
utils/dc_utils.py
CHANGED
@@ -22,10 +22,12 @@ def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1):
     original_height, original_width = vid.get_batch([0]).shape[1:3]
     height = original_height
     width = original_width
+    print(f'==> original video size: {original_height} x {original_width}')
     if max_res > 0 and max(height, width) > max_res:
         scale = max_res / max(original_height, original_width)
         height = ensure_even(round(original_height * scale))
         width = ensure_even(round(original_width * scale))
+        print(f'==> downsample video size: {height} x {width}')
 
     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
 
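The resize logic above caps the longer side at `max_res` while keeping both dimensions even (video codecs generally require even frame sizes). Below is a self-contained sketch of that computation; `ensure_even` is defined elsewhere in `dc_utils.py`, so the implementation here is an assumption consistent with its name:

```python
def ensure_even(value: int) -> int:
    # assumed behavior: round down to the nearest even integer
    return value if value % 2 == 0 else value - 1

def target_size(original_height: int, original_width: int, max_res: int):
    """Mirrors the resize logic in read_video_frames."""
    height, width = original_height, original_width
    if max_res > 0 and max(height, width) > max_res:
        scale = max_res / max(original_height, original_width)
        height = ensure_even(round(original_height * scale))
        width = ensure_even(round(original_width * scale))
    return height, width

# e.g. a 2160x3840 (4K) video capped at max_res=1280 becomes 720x1280
print(target_size(2160, 3840, 1280))  # -> (720, 1280)
```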
video_depth_anything/dpt_temporal.py
CHANGED
@@ -91,6 +91,8 @@ class DPTHeadTemporal(DPTHead):
         out = F.interpolate(
             out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
         )
-        out = self.scratch.output_conv2(out)
+        ori_type = out.dtype
+        with torch.autocast(device_type="cuda", enabled=False):
+            out = self.scratch.output_conv2(out.float())
 
-        return out
+        return out.to(ori_type)
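This change carves out an fp32 "island" inside the fp16 autocast region: the head's final convolution is run in full precision for numerical stability, and the result is cast back so downstream code keeps seeing the autocast dtype. A minimal sketch of the same pattern on a toy layer (requires a CUDA device; the layer shapes are placeholders for `self.scratch.output_conv2`):

```python
import torch
import torch.nn as nn

body = nn.Conv2d(32, 32, 3, padding=1).cuda()         # stand-in for the DPT head body
output_conv2 = nn.Conv2d(32, 1, 3, padding=1).cuda()  # stand-in for scratch.output_conv2
x = torch.randn(1, 32, 64, 64, device="cuda")

with torch.no_grad(), torch.autocast(device_type="cuda", enabled=True):
    out = body(x)             # convolutions run in fp16 under autocast
    ori_type = out.dtype      # torch.float16
    with torch.autocast(device_type="cuda", enabled=False):
        # autocast disabled: upcasting the input keeps this op in true fp32
        out = output_conv2(out.float())
    out = out.to(ori_type)    # cast back so the caller still sees fp16

print(ori_type, out.dtype)    # torch.float16 torch.float16
```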
video_depth_anything/video_depth.py
CHANGED
@@ -104,7 +104,8 @@ class VideoDepthAnything(nn.Module):
                 cur_input[:, :OVERLAP, ...] = pre_input[:, KEYFRAMES, ...]
 
             with torch.no_grad():
-                depth = self.forward(cur_input) # depth shape: [1, T, H, W]
+                with torch.autocast(device_type=device, enabled=True):
+                    depth = self.forward(cur_input) # depth shape: [1, T, H, W]
 
             depth = F.interpolate(depth.flatten(0,1).unsqueeze(1), size=(frame_height, frame_width), mode='bilinear', align_corners=True)
             depth_list += [depth[i][0].cpu().numpy() for i in range(depth.shape[0])]
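This is the switch that actually enables fp16: wrapping the forward pass in `torch.autocast` makes eligible ops (convolutions, matmuls) run in half precision, while the fp32 island in `dpt_temporal.py` above protects the final projection. A minimal sketch of the same inference pattern with a toy model (the model, shapes, and device handling are illustrative, not the Space's actual pipeline):

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
    nn.Conv2d(8, 1, 3, padding=1),
).to(device)
frames = torch.randn(1, 3, 128, 128, device=device)  # stand-in input batch

with torch.no_grad():
    # enabled=True selects fp16 on CUDA (bf16 on CPU); ops without a
    # half-precision kernel transparently stay in fp32
    with torch.autocast(device_type=device, enabled=True):
        depth = model(frames)

print(depth.dtype)  # torch.float16 on CUDA
```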