File size: 8,640 Bytes
55ac26c
28da247
55ac26c
19217bc
28da247
 
 
55ac26c
 
bddb8a1
55ac26c
 
033fb37
55ac26c
46294a9
 
 
 
 
 
 
 
 
 
55ac26c
 
033fb37
55ac26c
 
033fb37
55ac26c
 
 
 
 
 
 
 
28da247
55ac26c
 
28da247
55ac26c
19217bc
 
 
 
 
55ac26c
 
 
 
 
864aa62
55ac26c
 
 
 
 
 
28da247
 
 
033fb37
46294a9
 
55ac26c
033fb37
55ac26c
033fb37
bddb8a1
 
55ac26c
 
 
 
033fb37
28da247
 
55ac26c
bddb8a1
 
033fb37
28da247
033fb37
 
 
28da247
 
033fb37
 
28da247
033fb37
28da247
033fb37
28da247
 
 
 
 
033fb37
28da247
4423b71
28da247
033fb37
 
 
 
 
28da247
 
 
 
 
bddb8a1
 
55ac26c
033fb37
28da247
55ac26c
 
 
 
 
28da247
19217bc
55ac26c
 
033fb37
4423b71
55ac26c
 
28da247
 
 
19217bc
55ac26c
 
28da247
 
 
 
 
 
 
 
55ac26c
 
19217bc
55ac26c
 
28da247
 
55ac26c
4423b71
 
55ac26c
19217bc
55ac26c
 
28da247
 
55ac26c
19217bc
55ac26c
 
 
 
6cc4593
9c76f77
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os
import gc
import torch
import cv2
import gradio as gr
import numpy as np
import matplotlib.cm as cm

from video_depth_anything.video_depth import VideoDepthAnything
from utils.dc_utils import read_video_frames, save_video
from huggingface_hub import hf_hub_download

# Example rows for the Gradio demo. Every example uses the same settings and
# only the video differs; each row is laid out as
# [video, max_len, target_fps, max_res, stitch, grayscale, blur].
_EXAMPLE_VIDEOS = (
    'assets/example_videos/davis_rollercoaster.mp4',
    'assets/example_videos/Tokyo-Walk_rgb.mp4',
    'assets/example_videos/4158877-uhd_3840_2160_30fps_rgb.mp4',
    'assets/example_videos/4511004-uhd_3840_2160_24fps_rgb.mp4',
    'assets/example_videos/1753029-hd_1920_1080_30fps.mp4',
    'assets/example_videos/davis_burnout.mp4',
    'assets/example_videos/example_5473765-l.mp4',
    'assets/example_videos/Istanbul-26920.mp4',
    'assets/example_videos/obj_1.mp4',
    'assets/example_videos/sheep_cut1.mp4',
)
examples = [
    [video, -1, -1, 1280, False, False, 0]
    for video in _EXAMPLE_VIDEOS
]

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Keyword arguments passed to the model constructor, keyed by encoder variant.
model_configs = dict(
    vits=dict(encoder='vits', features=64, out_channels=[48, 96, 192, 384]),
    vitl=dict(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]),
)
# Encoder key -> size name that appears in the checkpoint repository id.
encoder2name = dict(vits='Small', vitl='Large')

# Encoder variant served by this demo.
encoder = 'vitl'
model_name = encoder2name[encoder]

# Initialize the model at import time: build the network for the selected
# encoder variant, then download the matching checkpoint from the Hugging Face Hub.
video_depth_anything = VideoDepthAnything(**model_configs[encoder])
filepath = hf_hub_download(
    repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
    filename=f"video_depth_anything_{encoder}.pth",
    repo_type="model"
)
# Load the weights onto the CPU first, then move the model to DEVICE and put it in eval mode.
# NOTE(review): torch.load unpickles the checkpoint; consider weights_only=True
# if the installed torch version supports it — confirm before changing.
video_depth_anything.load_state_dict(torch.load(filepath, map_location='cpu'))
video_depth_anything = video_depth_anything.to(DEVICE).eval()

# Markdown rendered at the top of the demo page (see construct_demo).
title = "# Video Depth Anything"
description = """Official demo for **Video Depth Anything**.
Please refer to our [paper](https://arxiv.org/abs/2501.12375), [project page](https://videodepthanything.github.io/), and [github](https://github.com/DepthAnything/Video-Depth-Anything) for more details."""

def _inferno_colormap():
    """Return matplotlib's "inferno" colormap on both old and new matplotlib releases."""
    import matplotlib

    # `cm.get_cmap` was deprecated and later removed from matplotlib; the
    # `matplotlib.colormaps` registry is its replacement where available.
    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None:
        return registry["inferno"]
    return cm.get_cmap("inferno")


def _render_depth_frame(depth_frame, d_min, depth_range, grayscale, cmap, blur):
    """Convert one depth map into a uint8 3-channel image for the stitched video.

    Args:
        depth_frame: Depth map for a single frame (numpy array).
        d_min: Minimum depth over the whole clip, used as the normalization offset.
        depth_range: ``d_max - d_min`` over the whole clip.
        grayscale: If true, replicate the normalized depth into three channels;
            otherwise map it through ``cmap``.
        cmap: Colormap callable; only used when ``grayscale`` is false.
        blur: Blur factor; values > 0 apply a Gaussian blur.

    Returns:
        The rendered depth image as a uint8 array.
    """
    if depth_range > 0:
        depth_norm = ((depth_frame - d_min) / depth_range * 255).astype(np.uint8)
    else:
        # Constant depth across the clip: avoid a 0/0 division (NaN -> undefined
        # uint8 cast) and render a flat frame instead.
        depth_norm = np.zeros(depth_frame.shape, dtype=np.uint8)

    if grayscale:
        depth_vis = np.stack([depth_norm] * 3, axis=-1)
    else:
        depth_vis = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)

    if blur > 0:
        # Always an odd kernel size; blur factors below 0.05 yield a 1x1 kernel.
        kernel_size = int(blur * 20) * 2 + 1
        depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
    return depth_vis


def infer_video_depth(
    input_video: str,
    max_len: int = -1,
    target_fps: int = -1,
    max_res: int = 1280,
    stitch: bool = False,
    grayscale: bool = False,
    blur: float = 0.0,
    *,  # The following parameters are keyword-only and cannot be overridden by UI input.
    output_dir: str = './outputs',
    input_size: int = 518,
):
    """Run depth inference on a video and write the result videos to ``output_dir``.

    Args:
        input_video: Path of the video to process.
        max_len: Forwarded to ``read_video_frames`` (frame-count limit).
        target_fps: Forwarded to ``read_video_frames``; the frame rate that helper
            returns is the one used for inference and for the full-resolution re-read.
        max_res: Forwarded to ``read_video_frames`` as the resolution limit for inference.
        stitch: If true, also write a side-by-side RGB | depth video built from
            frames re-read with ``max_res=-1``.
        grayscale: Render the stitched depth as grayscale instead of "inferno".
        blur: Blur factor for the stitched depth image; 0 disables blurring.
        output_dir: Directory for the output files; created if missing.
        input_size: Forwarded to the model's ``infer_video_depth``.

    Returns:
        ``[processed_video_path, depth_vis_path, stitched_video_path]``, where the
        last entry is ``None`` when ``stitch`` is false.

    Raises:
        gr.Error: If no input video was provided.
    """
    if not input_video:
        # Surface a readable message in the UI instead of an opaque failure downstream.
        raise gr.Error("Please provide an input video.")

    # Read input video frames with the given maximum resolution (max_res) for inference.
    frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
    # Perform depth inference using the model.
    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device=DEVICE)

    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(input_video))[0]

    # Save the preprocessed (RGB) video and the depth visualization (using the default color mapping)
    processed_video_path = os.path.join(output_dir, stem + '_src.mp4')
    depth_vis_path = os.path.join(output_dir, stem + '_vis.mp4')
    save_video(frames, processed_video_path, fps=fps)
    save_video(depths, depth_vis_path, fps=fps, is_depths=True)

    stitched_video_path = None
    if stitch:
        # For stitching: read the original video again without a resolution limit.
        full_frames, _ = read_video_frames(input_video, max_len, target_fps, max_res=-1)
        # Normalize against the clip-wide depth range so brightness is consistent across frames.
        d_min, d_max = depths.min(), depths.max()
        depth_range = d_max - d_min
        # Look the colormap up once rather than once per frame.
        cmap = None if grayscale else _inferno_colormap()

        stitched_frames = []
        # zip stops at the shorter sequence, matching the original min(len, len) bound.
        for rgb_full, depth_frame in zip(full_frames, depths):
            depth_vis = _render_depth_frame(depth_frame, d_min, depth_range, grayscale, cmap, blur)
            # Resize the depth visual image to match the full-resolution RGB frame.
            H_full, W_full = rgb_full.shape[:2]
            depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full))
            # RGB frame on the left, depth visual on the right.
            stitched_frames.append(cv2.hconcat([rgb_full, depth_vis_resized]))

        stitched_video_path = os.path.join(output_dir, stem + '_stitched.mp4')
        save_video(np.array(stitched_frames), stitched_video_path, fps=fps)

    gc.collect()
    torch.cuda.empty_cache()

    # Return the processed RGB video, depth visualization, and (if created) the stitched video.
    return [processed_video_path, depth_vis_path, stitched_video_path]

def construct_demo():
    """Assemble the Gradio Blocks UI for the demo and return it without launching."""
    # Settings shared by the three read-only result players.
    player_kwargs = dict(
        interactive=False,
        autoplay=True,
        loop=True,
        show_share_button=True,
        scale=5,
    )

    with gr.Blocks(analytics_enabled=False) as demo:
        for markdown_text in (
            title,
            description,
            "### If you find this work useful, please help ⭐ the [Github Repo](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for your attention!",
        ):
            gr.Markdown(markdown_text)

        # First row: upload widget on the left, result players on the right.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_video = gr.Video(label="Input Video")
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    processed_video = gr.Video(label="Preprocessed Video", **player_kwargs)
                    depth_vis_video = gr.Video(label="Generated Depth Video", **player_kwargs)
                    stitched_video = gr.Video(label="Stitched RGBD Video", **player_kwargs)

        # Second row: collapsible settings and the Generate button.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Accordion("Advanced Settings", open=False):
                    max_len = gr.Slider(minimum=-1, maximum=1000, step=1, value=500, label="Max process length")
                    target_fps = gr.Slider(minimum=-1, maximum=30, step=1, value=15, label="Target FPS")
                    max_res = gr.Slider(minimum=480, maximum=1920, step=1, value=1280, label="Max side resolution")
                    stitch_option = gr.Checkbox(value=False, label="Stitch RGB & Depth Videos")
                    grayscale_option = gr.Checkbox(value=False, label="Output Depth as Grayscale")
                    blur_slider = gr.Slider(label="Depth Blur Factor", minimum=0, maximum=1, step=0.01, value=0)
                generate_btn = gr.Button("Generate")
            # Intentionally empty column (presumably a spacer mirroring the 1:2 split above).
            with gr.Column(scale=2):
                pass

        # The examples widget and the button drive the same function with the
        # same components, in the positional order infer_video_depth expects.
        input_components = (
            input_video,
            max_len,
            target_fps,
            max_res,
            stitch_option,
            grayscale_option,
            blur_slider,
        )
        output_components = (processed_video, depth_vis_video, stitched_video)

        gr.Examples(
            examples=examples,
            fn=infer_video_depth,
            inputs=list(input_components),
            outputs=list(output_components),
            cache_examples=True,
            cache_mode="lazy",
        )

        generate_btn.click(
            fn=infer_video_depth,
            inputs=list(input_components),
            outputs=list(output_components),
        )

    return demo

if __name__ == "__main__":
    # Build the UI, enable the request queue, then serve it with a public share link.
    demo = construct_demo()
    demo.queue().launch(share=True)