File size: 7,700 Bytes
55ac26c
28da247
55ac26c
28da247
 
 
 
55ac26c
 
bddb8a1
55ac26c
 
28da247
55ac26c
31acc4c
6596772
 
 
9ee2acc
 
 
 
 
 
55ac26c
 
 
 
28da247
55ac26c
 
 
 
 
 
 
 
 
 
28da247
55ac26c
 
28da247
55ac26c
28da247
 
 
55ac26c
 
 
 
 
864aa62
55ac26c
 
83f9422
55ac26c
 
 
 
 
 
 
28da247
 
 
55ac26c
28da247
55ac26c
28da247
bddb8a1
 
55ac26c
 
 
 
28da247
 
 
55ac26c
bddb8a1
 
28da247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bddb8a1
 
55ac26c
28da247
 
 
55ac26c
 
 
 
 
 
28da247
55ac26c
 
 
28da247
55ac26c
 
28da247
 
 
55ac26c
 
28da247
 
 
 
 
 
 
 
55ac26c
 
 
 
 
28da247
 
55ac26c
 
 
 
 
 
28da247
 
55ac26c
 
 
 
 
 
 
28da247
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import gc
import torch
import cv2  # Wird für die Bildverarbeitung (z.B. hconcat, GaussianBlur) benötigt
import gradio as gr
import numpy as np
import matplotlib.cm as cm

from video_depth_anything.video_depth import VideoDepthAnything
from utils.dc_utils import read_video_frames, save_video
from huggingface_hub import hf_hub_download

# Examples for the Gradio Demo.
# Each row lines up with the first four demo inputs:
# [input video path, max process length, target FPS, max side resolution].
# All bundled clips share the same three numeric settings, so only the paths
# are listed and the rows are generated from them.
_example_video_paths = (
    'assets/example_videos/davis_rollercoaster.mp4',
    'assets/example_videos/Tokyo-Walk_rgb.mp4',
    'assets/example_videos/4158877-uhd_3840_2160_30fps_rgb.mp4',
    'assets/example_videos/4511004-uhd_3840_2160_24fps_rgb.mp4',
    'assets/example_videos/1753029-hd_1920_1080_30fps.mp4',
    'assets/example_videos/davis_burnout.mp4',
    'assets/example_videos/example_5473765-l.mp4',
    'assets/example_videos/Istanbul-26920.mp4',
    'assets/example_videos/obj_1.mp4',
    'assets/example_videos/sheep_cut1.mp4',
)
examples = [[video_path, -1, -1, 1280] for video_path in _example_video_paths]

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Model configuration: keyword arguments for the VideoDepthAnything
# constructor, keyed by encoder identifier.
model_configs = dict(
    vits=dict(encoder='vits', features=64, out_channels=[48, 96, 192, 384]),
    vitl=dict(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]),
)

# Encoder identifier -> size label used in the checkpoint repository name.
encoder2name = dict(vits='Small', vitl='Large')

# Encoder variant served by this demo.
encoder = 'vitl'
model_name = encoder2name[encoder]

# Initialize the model: build the network for the selected encoder, download
# the matching checkpoint from the Hugging Face Hub, load its weights on the
# CPU, then move the model to DEVICE and switch it to eval mode.
video_depth_anything = VideoDepthAnything(**model_configs[encoder])
filepath = hf_hub_download(repo_id=f"depth-anything/Video-Depth-Anything-{model_name}", 
                           filename=f"video_depth_anything_{encoder}.pth", 
                           repo_type="model")
# NOTE(review): torch.load unpickles the downloaded file. Consider passing
# weights_only=True once the minimum supported torch version is confirmed.
video_depth_anything.load_state_dict(torch.load(filepath, map_location='cpu'))
video_depth_anything = video_depth_anything.to(DEVICE).eval()

# Markdown shown at the top of the demo page.
title = "# Video Depth Anything"
description = (
    "Official demo for **Video Depth Anything**.\n"
    "Please refer to our [paper](https://arxiv.org/abs/2501.12375), "
    "[project page](https://videodepthanything.github.io/), "
    "and [github](https://github.com/DepthAnything/Video-Depth-Anything) for more details."
)


# NOTE(review): `gr.processing_utils.threaded` is not a documented Gradio
# helper. Resolve it defensively so that a missing attribute does not abort the
# import of this module; when it is absent the function is left undecorated.
# If a GPU scheduling decorator (e.g. `spaces.GPU`) was intended, swap it in
# here — TODO confirm.
_threaded = getattr(gr.processing_utils, "threaded", lambda fn: fn)


def _stitch_rgb_and_depth(frames, depths, grayscale=False, blur=0.0):
    """Build side-by-side frames: RGB on the left, depth rendering on the right.

    Depth is normalized to [0, 255] with the minimum and maximum taken over the
    whole ``depths`` array (not per frame), then rendered either as a
    three-channel grayscale image or with the "inferno" colormap.

    Args:
        frames: Sequence of RGB frames (assumed uint8 images with the same
            height as the depth maps, as required by ``cv2.hconcat`` —
            TODO confirm against ``read_video_frames``).
        depths: Array of per-frame depth maps; must support ``.min()``/``.max()``.
        grayscale: Render depth as gray instead of the "inferno" colormap.
        blur: Blur factor; values > 0 apply a Gaussian blur to the depth
            rendering with an odd kernel size of ``int(blur * 20) * 2 + 1``.

    Returns:
        ``np.ndarray`` stacking one stitched frame per (frame, depth) pair.
    """
    d_min, d_max = depths.min(), depths.max()
    # A constant depth clip would make the range zero and turn every pixel
    # into NaN; with a unit range such a clip normalizes to all zeros instead.
    d_range = d_max - d_min
    if not d_range > 0:
        d_range = 1.0

    # Loop-invariant setup. `cm.inferno` is the same colormap that
    # `cm.get_cmap("inferno")` returned, but it is still available on
    # Matplotlib releases where `cm.get_cmap` has been removed.
    colormap = None if grayscale else cm.inferno
    kernel_size = int(blur * 20) * 2 + 1 if blur > 0 else 0  # odd when set

    stitched_frames = []
    for rgb_frame, depth_frame in zip(frames, depths):
        depth_norm = ((depth_frame - d_min) / d_range * 255).astype(np.uint8)
        if colormap is None:
            depth_vis = np.stack([depth_norm] * 3, axis=-1)
        else:
            # The colormap yields RGBA; keep only the first three channels.
            depth_vis = (colormap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)
        if kernel_size:
            depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
        stitched_frames.append(cv2.hconcat([rgb_frame, depth_vis]))
    return np.array(stitched_frames)


@_threaded
def infer_video_depth(
    input_video: str,
    max_len: int = -1,
    target_fps: int = -1,
    max_res: int = 1280,
    output_dir: str = './outputs',
    input_size: int = 518,
    stitch: bool = False,
    grayscale: bool = False,
    blur: float = 0.0,
) -> list:
    """Run depth estimation on a video and write the result videos to disk.

    Three files may be written into ``output_dir``, named after the input
    file's stem: ``<stem>_src.mp4`` (the frames as read), ``<stem>_vis.mp4``
    (the depth maps, saved with ``is_depths=True``) and, when ``stitch`` is
    set, ``<stem>_stitched.mp4`` (RGB and depth side by side).

    Args:
        input_video: Path of the video to process.
        max_len: Forwarded to ``read_video_frames`` (presumably the maximum
            number of frames, -1 meaning no limit — confirm in utils.dc_utils).
        target_fps: Forwarded to ``read_video_frames`` (presumably -1 keeps
            the source frame rate — confirm in utils.dc_utils).
        max_res: Forwarded to ``read_video_frames``.
        output_dir: Directory for the output videos; created if missing.
        input_size: Forwarded to the model's ``infer_video_depth``.
        stitch: Also produce the side-by-side video.
        grayscale: Stitched video only — render depth in gray instead of the
            "inferno" colormap.
        blur: Stitched video only — Gaussian blur factor for the depth half.

    Returns:
        ``[processed_video_path, depth_vis_path, stitched_video_path]``; the
        last entry is ``None`` when ``stitch`` is false, so a ``gr.Video``
        output is cleared rather than handed an empty file path.

    Raises:
        gr.Error: If no input video was supplied.
    """
    if not input_video:
        raise gr.Error("Please provide an input video.")

    try:
        # Read input video frames
        frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
        # Infer depths using the model
        depths, fps = video_depth_anything.infer_video_depth(
            frames, target_fps, input_size=input_size, device=DEVICE
        )

        os.makedirs(output_dir, exist_ok=True)
        stem = os.path.splitext(os.path.basename(input_video))[0]

        # Save the processed (RGB) video and the depth visualization
        processed_video_path = os.path.join(output_dir, stem + '_src.mp4')
        depth_vis_path = os.path.join(output_dir, stem + '_vis.mp4')
        save_video(frames, processed_video_path, fps=fps)
        save_video(depths, depth_vis_path, fps=fps, is_depths=True)

        stitched_video_path = None
        if stitch:
            stitched_frames = _stitch_rgb_and_depth(frames, depths, grayscale, blur)
            stitched_video_path = os.path.join(output_dir, stem + '_stitched.mp4')
            save_video(stitched_frames, stitched_video_path, fps=fps)
    finally:
        # Release Python garbage and cached CUDA blocks even when a step fails,
        # so a failed request does not keep memory pinned for the next one.
        gc.collect()
        torch.cuda.empty_cache()

    return [processed_video_path, depth_vis_path, stitched_video_path]


def _run_inference_from_ui(
    input_video,
    max_len=-1,
    target_fps=-1,
    max_res=1280,
    stitch=False,
    grayscale=False,
    blur=0.0,
):
    """Adapt the demo's positional inputs to ``infer_video_depth``.

    The UI supplies (video, max_len, target_fps, max_res, stitch, grayscale,
    blur) positionally, but ``infer_video_depth`` has ``output_dir`` and
    ``input_size`` as its fifth and sixth parameters. Passing the UI values
    straight through would put the stitch checkbox into ``output_dir``, so the
    last three values are forwarded by keyword instead. They have defaults
    because the example rows only provide the first four values.
    """
    return infer_video_depth(
        input_video,
        max_len,
        target_fps,
        max_res,
        stitch=stitch,
        grayscale=grayscale,
        blur=blur,
    )


def construct_demo():
    """Build the Gradio Blocks UI for the demo and return it (not launched).

    Layout: an input video next to three output players (preprocessed RGB,
    depth visualization, stitched RGBD), an "Advanced Settings" accordion with
    the processing options, a "Generate" button and the bundled examples.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        gr.Markdown("### If you find this work useful, please help ⭐ the [Github Repo](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for your attention!")

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                # `sources` is the keyword accepted by the Gradio release this
                # file otherwise targets (it relies on `loop` and lazy example
                # caching); the former `source=`/`type=` keywords are rejected.
                input_video = gr.Video(label="Input Video", sources=["upload"])
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    processed_video = gr.Video(label="Preprocessed Video", interactive=False, autoplay=True, loop=True, show_share_button=True, scale=5)
                    depth_vis_video = gr.Video(label="Generated Depth Video", interactive=False, autoplay=True, loop=True, show_share_button=True, scale=5)
                    stitched_video = gr.Video(label="Stitched RGBD Video", interactive=False, autoplay=True, loop=True, show_share_button=True, scale=5)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Accordion("Advanced Settings", open=False):
                    max_len = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=500, step=1)
                    target_fps = gr.Slider(label="Target FPS", minimum=-1, maximum=30, value=15, step=1)
                    max_res = gr.Slider(label="Max side resolution", minimum=480, maximum=1920, value=1280, step=1)
                    stitch_option = gr.Checkbox(label="Stitch RGB & Depth Videos", value=False)
                    grayscale_option = gr.Checkbox(label="Output Depth as Grayscale", value=False)
                    blur_slider = gr.Slider(minimum=0, maximum=1, step=0.01, label="Depth Blur Factor", value=0)
                generate_btn = gr.Button("Generate")
            with gr.Column(scale=2):
                # Empty column: keeps the settings column at one third width.
                pass

        # Shared by the examples and the button so both stay wired identically.
        ui_inputs = [input_video, max_len, target_fps, max_res, stitch_option, grayscale_option, blur_slider]
        ui_outputs = [processed_video, depth_vis_video, stitched_video]

        gr.Examples(
            examples=examples,
            inputs=ui_inputs,
            outputs=ui_outputs,
            fn=_run_inference_from_ui,
            cache_examples="lazy",
        )

        generate_btn.click(
            fn=_run_inference_from_ui,
            inputs=ui_inputs,
            outputs=ui_outputs,
            # Keep the API endpoint name independent of the adapter's name.
            api_name="infer_video_depth",
        )

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue, then serve
    # it with share=True.
    demo = construct_demo()
    demo.queue().launch(share=True)