# Provenance (Hugging Face file view): uploaded by Krokodilpirat, verified commit 033fb37
# with message "Update app.py"; file size 8.64 kB.
import os
import gc
import torch
import cv2
import gradio as gr
import numpy as np
import matplotlib.cm as cm
from video_depth_anything.video_depth import VideoDepthAnything
from utils.dc_utils import read_video_frames, save_video
from huggingface_hub import hf_hub_download
# Example rows for the Gradio demo. Each row is
# [video path, max_len, target_fps, max_res, stitch, grayscale, blur];
# every example uses the same settings and differs only in the video file.
examples = [
    [f'assets/example_videos/{clip}', -1, -1, 1280, False, False, 0]
    for clip in (
        'davis_rollercoaster.mp4',
        'Tokyo-Walk_rgb.mp4',
        '4158877-uhd_3840_2160_30fps_rgb.mp4',
        '4511004-uhd_3840_2160_24fps_rgb.mp4',
        '1753029-hd_1920_1080_30fps.mp4',
        'davis_burnout.mp4',
        'example_5473765-l.mp4',
        'Istanbul-26920.mp4',
        'obj_1.mp4',
        'sheep_cut1.mp4',
    )
]
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Keyword arguments passed to the model constructor, keyed by encoder variant.
model_configs = {
    'vits': dict(encoder='vits', features=64, out_channels=[48, 96, 192, 384]),
    'vitl': dict(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]),
}

# Display name of each encoder variant (also part of the checkpoint repo id).
encoder2name = dict(vits='Small', vitl='Large')

# Encoder variant served by this demo, and its display name.
encoder = 'vitl'
model_name = encoder2name[encoder]
# Build the network for the selected encoder variant and download its checkpoint.
video_depth_anything = VideoDepthAnything(**model_configs[encoder])
filepath = hf_hub_download(
    repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
    filename=f"video_depth_anything_{encoder}.pth",
    repo_type="model",
)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while it is being loaded.
# The loaded dict is passed straight through so no extra reference to it is kept.
video_depth_anything.load_state_dict(
    torch.load(filepath, map_location='cpu', weights_only=True)
)
# Move the model to the target device and switch it to inference mode.
video_depth_anything = video_depth_anything.to(DEVICE).eval()
# Markdown snippets rendered at the top of the demo page.
title = "# Video Depth Anything"
description = (
    "Official demo for **Video Depth Anything**.\n"
    "Please refer to our [paper](https://arxiv.org/abs/2501.12375), "
    "[project page](https://videodepthanything.github.io/), "
    "and [github](https://github.com/DepthAnything/Video-Depth-Anything) for more details."
)
def _load_inferno_colormap():
    """Return matplotlib's "inferno" colormap, preferring the colormap registry."""
    import matplotlib  # local import: only needed when a coloured stitched video is built

    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None:
        return registry["inferno"]
    # Older matplotlib releases have no registry; use the legacy accessor there.
    return cm.get_cmap("inferno")


def _render_depth_frame(depth_frame, d_min, d_max, grayscale, blur, cmap):
    """Convert one depth map into a 3-channel uint8 image for the stitched video.

    ``d_min``/``d_max`` are the depth extremes over the whole clip, so every
    frame is normalised on the same scale. ``cmap`` is only used when
    ``grayscale`` is false.
    """
    depth_range = d_max - d_min
    if depth_range > 0:
        scaled = (depth_frame - d_min) / depth_range
    else:
        # All depth values are equal (or the range is not a positive number):
        # dividing would produce NaN/inf, so render a uniform frame instead.
        scaled = np.zeros_like(depth_frame, dtype=np.float32)
    depth_norm = (scaled * 255).astype(np.uint8)

    if grayscale:
        depth_vis = np.stack([depth_norm] * 3, axis=-1)
    else:
        # The colormap returns RGBA floats in [0, 1]; keep RGB and rescale to uint8.
        depth_vis = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)

    if blur > 0:
        # Always odd, as GaussianBlur requires; very small blur values give size 1.
        kernel_size = int(blur * 20) * 2 + 1
        depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
    return depth_vis


def infer_video_depth(
    input_video: str,
    max_len: int = -1,
    target_fps: int = -1,
    max_res: int = 1280,
    stitch: bool = False,
    grayscale: bool = False,
    blur: float = 0.0,
    *,  # The following parameters are keyword-only and cannot be overridden by UI input.
    output_dir: str = './outputs',
    input_size: int = 518,
):
    """Run depth inference on a video and write the result videos to ``output_dir``.

    Args:
        input_video: Path of the video to process.
        max_len: Forwarded to ``read_video_frames`` (maximum process length).
        target_fps: Forwarded to ``read_video_frames`` (target frame rate).
        max_res: Forwarded to ``read_video_frames`` for the inference frames.
        stitch: When true, also write a side-by-side video of the RGB frames
            (read again with ``max_res=-1``) and the depth visualisation.
        grayscale: Render the stitched depth as grayscale instead of "inferno".
        blur: Blur factor for the stitched depth image; 0 disables blurring.
        output_dir: Directory for the output files; created if missing.
        input_size: Forwarded to the model's ``infer_video_depth``.

    Returns:
        ``[processed_video_path, depth_vis_path, stitched_video_path]``, where
        the last entry is ``None`` unless ``stitch`` is true.
    """
    # Read the frames used for inference; the reader also returns the frame rate to use.
    frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
    depths, fps = video_depth_anything.infer_video_depth(
        frames, target_fps, input_size=input_size, device=DEVICE
    )

    # exist_ok avoids a race between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(input_video))[0]

    # Save the preprocessed RGB video and the default depth visualisation.
    processed_video_path = os.path.join(output_dir, stem + '_src.mp4')
    depth_vis_path = os.path.join(output_dir, stem + '_vis.mp4')
    save_video(frames, processed_video_path, fps=fps)
    save_video(depths, depth_vis_path, fps=fps, is_depths=True)

    stitched_video_path = None
    if stitch:
        # Re-read the video with max_res=-1 so the RGB half is not downscaled.
        # NOTE(review): target_fps now holds the rate returned by the first read,
        # so this presumably samples the same frames - confirm in read_video_frames.
        full_frames, _ = read_video_frames(input_video, max_len, target_fps, max_res=-1)
        d_min, d_max = depths.min(), depths.max()
        # Look the colormap up once instead of once per frame.
        cmap = None if grayscale else _load_inferno_colormap()

        stitched_frames = []
        # zip stops at the shorter sequence, in case the two reads differ in length.
        for rgb_full, depth_frame in zip(full_frames, depths):
            depth_vis = _render_depth_frame(depth_frame, d_min, d_max, grayscale, blur, cmap)
            # The depth maps are at inference resolution; scale to the RGB frame size.
            full_height, full_width = rgb_full.shape[:2]
            depth_vis_resized = cv2.resize(depth_vis, (full_width, full_height))
            # RGB on the left, depth on the right.
            stitched_frames.append(cv2.hconcat([rgb_full, depth_vis_resized]))

        stitched_frames = np.array(stitched_frames)
        stitched_video_path = os.path.join(output_dir, stem + '_stitched.mp4')
        save_video(stitched_frames, stitched_video_path, fps=fps)

    # Release Python garbage and cached GPU memory before the next request.
    gc.collect()
    torch.cuda.empty_cache()
    return [processed_video_path, depth_vis_path, stitched_video_path]
def construct_demo():
    """Build the Gradio Blocks interface and connect it to ``infer_video_depth``.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    # Options shared by the three read-only result players.
    player_options = dict(
        interactive=False,
        autoplay=True,
        loop=True,
        show_share_button=True,
        scale=5,
    )

    with gr.Blocks(analytics_enabled=False) as demo:
        for markdown_text in (
            title,
            description,
            "### If you find this work useful, please help ⭐ the [Github Repo](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for your attention!",
        ):
            gr.Markdown(markdown_text)

        # Top row: upload on the left, result players on the right.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                # Video component used for file upload ('source' is not specified).
                source_video = gr.Video(label="Input Video")
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    src_player = gr.Video(label="Preprocessed Video", **player_options)
                    depth_player = gr.Video(label="Generated Depth Video", **player_options)
                    stitched_player = gr.Video(label="Stitched RGBD Video", **player_options)

        # Second row: settings and the trigger button; the wide column is an empty spacer.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Accordion("Advanced Settings", open=False):
                    len_slider = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=500, step=1)
                    fps_slider = gr.Slider(label="Target FPS", minimum=-1, maximum=30, value=15, step=1)
                    res_slider = gr.Slider(label="Max side resolution", minimum=480, maximum=1920, value=1280, step=1)
                    stitch_box = gr.Checkbox(label="Stitch RGB & Depth Videos", value=False)
                    gray_box = gr.Checkbox(label="Output Depth as Grayscale", value=False)
                    blur_control = gr.Slider(minimum=0, maximum=1, step=0.01, label="Depth Blur Factor", value=0)
                run_button = gr.Button("Generate")
            with gr.Column(scale=2):
                pass

        # Component order matches the positional parameters of infer_video_depth
        # and the three paths it returns.
        ui_inputs = [source_video, len_slider, fps_slider, res_slider, stitch_box, gray_box, blur_control]
        ui_outputs = [src_player, depth_player, stitched_player]

        gr.Examples(
            examples=examples,
            inputs=ui_inputs,
            outputs=ui_outputs,
            fn=infer_video_depth,
            cache_examples=True,
            cache_mode="lazy",
        )
        run_button.click(fn=infer_video_depth, inputs=ui_inputs, outputs=ui_outputs)

    return demo
if __name__ == "__main__":
    demo = construct_demo()
    # Turn on the request queue, then start the server with a public share link.
    demo.queue().launch(share=True)